rm(list = ls())
# For reproducibility purposes
set.seed(123)
# List of required libraries
required_libraries <- c("skimr", "ggplot2","dplyr", "VIM", "dbscan", "RColorBrewer", "isotree" ,"e1071", "bestNormalize","caret", "GGally", "corrplot", "rpart", "rpart.plot", "MASS","biotools", "klaR", "iml", "fastshap","randomForest", "gbm", "xgboost","reshape2", "pdp","pROC", "shapviz")
# Install and load libraries
suppressWarnings({
for (lib in required_libraries) {
if (!require(lib, character.only = TRUE)) {
install.packages(lib, dependencies = TRUE)
library(lib, character.only = TRUE)
}
}
})
## Loading required package: skimr
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: VIM
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
## Loading required package: dbscan
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:VIM':
##
## kNN
## The following object is masked from 'package:stats':
##
## as.dendrogram
## Loading required package: RColorBrewer
## Loading required package: isotree
## Loading required package: e1071
## Loading required package: bestNormalize
## Loading required package: caret
## Loading required package: lattice
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading required package: corrplot
## corrplot 0.92 loaded
## Loading required package: rpart
## Loading required package: rpart.plot
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:bestNormalize':
##
## boxcox
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: biotools
## ---
## biotools version 4.2
## Loading required package: klaR
## Loading required package: iml
## Loading required package: fastshap
##
## Attaching package: 'fastshap'
## The following object is masked from 'package:dplyr':
##
## explain
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: gbm
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
## Loading required package: xgboost
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
## Loading required package: reshape2
## Loading required package: pdp
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Loading required package: shapviz
We will start by importing the dataset and interpreting both its structure and content.
data <- read.csv("train.csv", sep = ",")
skim(data)
| Name | data |
| Number of rows | 1460 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| character | 43 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MSZoning | 0 | 1.00 | 2 | 7 | 0 | 5 | 0 |
| Street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| Alley | 1369 | 0.06 | 4 | 4 | 0 | 2 | 0 |
| LotShape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| LandContour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| Utilities | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| LotConfig | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
| LandSlope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| Neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
| Condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| Condition2 | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| BldgType | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| HouseStyle | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| RoofStyle | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| RoofMatl | 0 | 1.00 | 4 | 7 | 0 | 8 | 0 |
| Exterior1st | 0 | 1.00 | 5 | 7 | 0 | 15 | 0 |
| Exterior2nd | 0 | 1.00 | 5 | 7 | 0 | 16 | 0 |
| MasVnrType | 8 | 0.99 | 4 | 7 | 0 | 4 | 0 |
| ExterQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| ExterCond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| Foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| BsmtQual | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtCond | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtExposure | 38 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtFinType1 | 37 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| BsmtFinType2 | 38 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| Heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
| HeatingQC | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| CentralAir | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| Electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
| KitchenQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| Functional | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| FireplaceQu | 690 | 0.53 | 2 | 2 | 0 | 5 | 0 |
| GarageType | 81 | 0.94 | 6 | 7 | 0 | 6 | 0 |
| GarageFinish | 81 | 0.94 | 3 | 3 | 0 | 3 | 0 |
| GarageQual | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| GarageCond | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| PavedDrive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| PoolQC | 1453 | 0.00 | 2 | 2 | 0 | 3 | 0 |
| Fence | 1179 | 0.19 | 4 | 5 | 0 | 4 | 0 |
| MiscFeature | 1406 | 0.04 | 4 | 4 | 0 | 4 | 0 |
| SaleType | 0 | 1.00 | 2 | 5 | 0 | 9 | 0 |
| SaleCondition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
| MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
| LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 | ▇▃▁▁▁ |
| LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
| OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
| OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
| MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 | ▇▁▁▁▁ |
| BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 | ▇▁▁▁▁ |
| BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 | ▇▁▁▁▁ |
| BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 | ▇▅▂▁▁ |
| TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
| X1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
| X2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
| LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 | ▇▁▁▁▁ |
| GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
| BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
| FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
| BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
| TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
| Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▁▁ |
| GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 | ▁▁▅▅▇ |
| GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▃▇▂▁ |
| GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
| WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 | ▇▂▁▁▁ |
| OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 | ▇▁▁▁▁ |
| EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 | ▇▁▁▁▁ |
| X3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
| ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 | ▇▁▁▁▁ |
| PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 | ▇▁▁▁▁ |
| MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 | ▇▁▁▁▁ |
| MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
| YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
With a first visualization of the dataset we can see that 43 of the columns are categorical while 38 of them are numerical. We can also observe some missing values as well as some redundant columns, like Id.
Before doing any other analysis or operations with the data we will split into training and testing sets. This is because we will treat the test data as “future” data, and looking at it during development could unintentionally bias the models.
Furthermore, since the dataset is relatively small (~1500 observations), we will reserve a 20% of data for testing The remaining 80% will be used for training, since we will also use cross-validation to evaluate any model. This ensures that we have sufficient data to have robust predictions and evaluations.
split <- createDataPartition(data$SalePrice, p = 0.8, list = FALSE)
training <- data[split,]
testing <- data[-split,]
nrow(training)
## [1] 1169
nrow(testing)
## [1] 291
Before training any model, it is mandatory to perform a thorough preprocessing. This does not only consist on correcting any errors on the data, but also on transforming it into a format that the algorithms that will be applied can handle. Moreover, it is also a key step to improve model performance, since eliminating noise will positively impact the precision of predictions.
We observed before that there were some columns that had many NA values. We will use a barplot to visualize this better and decide what to do with them.
# Barplot with the number of NA values
barplot(colMeans(is.na(training)), las=2)
Based on the previous plot and the skim function used we can see that there are some variables which have NA values for most if not almost all of the observations.
# Finding out the columns with a high number of NA values
colnames(training)[which(colMeans(is.na(training))>0.75)]
## [1] "Alley" "PoolQC" "Fence" "MiscFeature"
These columns are Alley, PoolQC, Fence and MiscFeature.
Alley refers to the type of alley access to the property. It can be Gravel or Paved, NA means no alley, almost all houses have an NA value here, we don’t think that the type of alley is of great importance, so we will remove it.
PoolQC refers to the quality of the swimming pool, as most houses don’t have a pool and there are other variables such as PoolArea which refers to the dimension of the swimming pool, we will remove it.
Fence refers to the quality of the fencing, following a similar explanation as before, it has too many NA values, it won’t provide much information
MiscFeature mentions other additional features houses may have or not, such as tennis courts. As this are really exclusive and as with the other variables there are a lot of missing values we will delete it.
# Deleting the variables
na_columns <- c("Alley","PoolQC","Fence", "MiscFeature")
training <- training[, !(colnames(training) %in% na_columns)]
# Obtaining the remaining number of NA values
ncol(training %>% filter(if_any(everything(), is.na)))
## [1] 77
# Obtaining the remaining number of NA values
nrow(training %>% filter(if_any(everything(), is.na)))
## [1] 716
We can see that we still have many observations with NA values, therefore we will now concentrate on replacing the missing values, taking into account their importance and if the missing variable is categorical or not.
To solve missing NA values we will separate the dataset into both numerical and categorical variables. Then we will work with both split datasets, and after dealing with the missing values we will merge them back.
# Separating our dataset into both numerical and categorical data
numeric_data <- select_if(training, is.numeric)
numeric_data <- numeric_data[,-1]
categorical_data <- select_if(training, is.character)
We will start working with numerical data as it will be a little bit easier. Firstly we will have a look at the different columns.
skim(numeric_data)
| Name | numeric_data |
| Number of rows | 1169 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| numeric | 37 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| MSSubClass | 0 | 1.00 | 56.66 | 41.84 | 20 | 20 | 50 | 70 | 190 | ▇▅▂▁▁ |
| LotFrontage | 217 | 0.81 | 69.98 | 24.49 | 21 | 59 | 69 | 80 | 313 | ▇▃▁▁▁ |
| LotArea | 0 | 1.00 | 10504.23 | 9024.91 | 1300 | 7590 | 9480 | 11700 | 164660 | ▇▁▁▁▁ |
| OverallQual | 0 | 1.00 | 6.09 | 1.37 | 1 | 5 | 6 | 7 | 10 | ▁▂▇▅▁ |
| OverallCond | 0 | 1.00 | 5.55 | 1.11 | 1 | 5 | 5 | 6 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1.00 | 1971.32 | 30.23 | 1872 | 1954 | 1973 | 2000 | 2009 | ▁▂▃▆▇ |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.67 | 1950 | 1966 | 1994 | 2004 | 2010 | ▅▂▂▃▇ |
| MasVnrArea | 4 | 1.00 | 104.13 | 177.31 | 0 | 0 | 0 | 169 | 1378 | ▇▁▁▁▁ |
| BsmtFinSF1 | 0 | 1.00 | 440.40 | 455.63 | 0 | 0 | 377 | 712 | 5644 | ▇▁▁▁▁ |
| BsmtFinSF2 | 0 | 1.00 | 48.62 | 168.30 | 0 | 0 | 0 | 0 | 1474 | ▇▁▁▁▁ |
| BsmtUnfSF | 0 | 1.00 | 568.88 | 440.47 | 0 | 225 | 479 | 806 | 2153 | ▇▆▂▁▁ |
| TotalBsmtSF | 0 | 1.00 | 1057.90 | 439.51 | 0 | 793 | 992 | 1304 | 6110 | ▇▃▁▁▁ |
| X1stFlrSF | 0 | 1.00 | 1161.52 | 381.50 | 334 | 882 | 1086 | 1394 | 4692 | ▇▅▁▁▁ |
| X2ndFlrSF | 0 | 1.00 | 340.03 | 428.95 | 0 | 0 | 0 | 728 | 1796 | ▇▂▂▁▁ |
| LowQualFinSF | 0 | 1.00 | 6.23 | 50.03 | 0 | 0 | 0 | 0 | 572 | ▇▁▁▁▁ |
| GrLivArea | 0 | 1.00 | 1507.77 | 503.07 | 334 | 1138 | 1466 | 1768 | 5642 | ▇▇▁▁▁ |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0 | 0 | 1 | 3 | ▇▆▁▁▁ |
| BsmtHalfBath | 0 | 1.00 | 0.05 | 0.23 | 0 | 0 | 0 | 0 | 2 | ▇▁▁▁▁ |
| FullBath | 0 | 1.00 | 1.55 | 0.54 | 0 | 1 | 2 | 2 | 3 | ▁▇▁▇▁ |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0 | 0 | 1 | 2 | ▇▁▅▁▁ |
| BedroomAbvGr | 0 | 1.00 | 2.86 | 0.83 | 0 | 2 | 3 | 3 | 8 | ▁▇▂▁▁ |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1 | 1 | 1 | 3 | ▁▇▁▁▁ |
| TotRmsAbvGrd | 0 | 1.00 | 6.50 | 1.60 | 2 | 5 | 6 | 7 | 14 | ▂▇▇▁▁ |
| Fireplaces | 0 | 1.00 | 0.60 | 0.64 | 0 | 0 | 1 | 1 | 3 | ▇▇▁▁▁ |
| GarageYrBlt | 66 | 0.94 | 1978.72 | 24.60 | 1900 | 1962 | 1980 | 2002 | 2010 | ▁▁▃▅▇ |
| GarageCars | 0 | 1.00 | 1.78 | 0.75 | 0 | 1 | 2 | 2 | 4 | ▁▃▇▂▁ |
| GarageArea | 0 | 1.00 | 474.49 | 215.58 | 0 | 336 | 480 | 576 | 1418 | ▂▇▃▁▁ |
| WoodDeckSF | 0 | 1.00 | 94.71 | 125.46 | 0 | 0 | 0 | 168 | 857 | ▇▂▁▁▁ |
| OpenPorchSF | 0 | 1.00 | 45.97 | 65.11 | 0 | 0 | 25 | 66 | 523 | ▇▁▁▁▁ |
| EnclosedPorch | 0 | 1.00 | 21.83 | 59.62 | 0 | 0 | 0 | 0 | 386 | ▇▁▁▁▁ |
| X3SsnPorch | 0 | 1.00 | 3.27 | 27.54 | 0 | 0 | 0 | 0 | 407 | ▇▁▁▁▁ |
| ScreenPorch | 0 | 1.00 | 14.99 | 54.43 | 0 | 0 | 0 | 0 | 440 | ▇▁▁▁▁ |
| PoolArea | 0 | 1.00 | 2.53 | 39.13 | 0 | 0 | 0 | 0 | 738 | ▇▁▁▁▁ |
| MiscVal | 0 | 1.00 | 32.30 | 304.99 | 0 | 0 | 0 | 0 | 8300 | ▇▁▁▁▁ |
| MoSold | 0 | 1.00 | 6.28 | 2.73 | 1 | 4 | 6 | 8 | 12 | ▃▆▇▃▃ |
| YrSold | 0 | 1.00 | 2007.81 | 1.33 | 2006 | 2007 | 2008 | 2009 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1.00 | 179754.74 | 76193.47 | 34900 | 130000 | 163000 | 214000 | 625000 | ▇▇▂▁▁ |
We can see that only 3 variables contain missing values, these are LotFrontage, MasVnrArea, GarageYrBlt.
Most of the observations have a value for LotFrontage which is the variable with more NA values, this is a high completion rate and therefore we will not remove it.
Finally, to solve these missing values we will use imputation by the median. The reason behind this is that most of the data is complete and we don’t think that doing any fancy imputation will drastically improve any future models in this case.
# Replace NA values with the median of each column
numeric_data <- numeric_data %>%
mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Number of rows with NA values
nrow(numeric_data %>% filter(if_any(everything(), is.na)))
## [1] 0
Now that the NA values are solved for numeric data, we will have to check the categorical data.
skim(categorical_data)
| Name | categorical_data |
| Number of rows | 1169 |
| Number of columns | 39 |
| _______________________ | |
| Column type frequency: | |
| character | 39 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MSZoning | 0 | 1.00 | 2 | 7 | 0 | 5 | 0 |
| Street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| LotShape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| LandContour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| Utilities | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| LotConfig | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
| LandSlope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| Neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
| Condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| Condition2 | 0 | 1.00 | 4 | 6 | 0 | 7 | 0 |
| BldgType | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| HouseStyle | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| RoofStyle | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| RoofMatl | 0 | 1.00 | 5 | 7 | 0 | 7 | 0 |
| Exterior1st | 0 | 1.00 | 5 | 7 | 0 | 14 | 0 |
| Exterior2nd | 0 | 1.00 | 5 | 7 | 0 | 15 | 0 |
| MasVnrType | 4 | 1.00 | 4 | 7 | 0 | 4 | 0 |
| ExterQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| ExterCond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| Foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| BsmtQual | 30 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtCond | 30 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtExposure | 31 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtFinType1 | 30 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| BsmtFinType2 | 31 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| Heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
| HeatingQC | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| CentralAir | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| Electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
| KitchenQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| Functional | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| FireplaceQu | 561 | 0.52 | 2 | 2 | 0 | 5 | 0 |
| GarageType | 66 | 0.94 | 6 | 7 | 0 | 6 | 0 |
| GarageFinish | 66 | 0.94 | 3 | 3 | 0 | 3 | 0 |
| GarageQual | 66 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| GarageCond | 66 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| PavedDrive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| SaleType | 0 | 1.00 | 2 | 5 | 0 | 9 | 0 |
| SaleCondition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
The variables with missing values are MasVnrType, all the variables referring to Bsmt which refers to the basement, one NA value in Electrical. Garage variables also have NA values. FirePlaceQu is the variable with most missing values.
Some of the NA values in the basement, garage and FirePlaceQu refer to that the house has none of these elements (as specified in the dataset webpage) , therefore we will change the NA values to None.
# Choosing the columns to change from NA to None
na_columns <- c("BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1","BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual","GarageCond")
categorical_data[na_columns][is.na(categorical_data[na_columns])] <- "None"
As for MasVnrType there are only 8 missing values, Electrical has only one missing value. Due to this we will use KNN. This finds the most similar observations based on the other variables and imputes the missing value accordingly. With this small number of NAs we reduce the risk of introducing bias or overfitting in the imputation.
# Replace NA values with the mode of each column
imputed_categorical_data <- VIM::kNN(categorical_data,
variable = c("MasVnrType","Electrical"),
k = 10)
# KNN creates a new dataset with extra columns which indicate the rows that have been changed, therefore we will have to change the imputed rows for our original rows
categorical_data$MasVnrType <- imputed_categorical_data$MasVnrType
categorical_data$Electrical <- imputed_categorical_data$Electrical
# Number of rows with NA values
nrow(categorical_data %>% filter(if_any(everything(), is.na)))
## [1] 0
Now that there are no NA values we can merge the training data back together.
# Combining both datasets and obtaining a general view of it
training <- cbind(numeric_data, categorical_data)
skim(training)
| Name | training |
| Number of rows | 1169 |
| Number of columns | 76 |
| _______________________ | |
| Column type frequency: | |
| character | 39 |
| numeric | 37 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MSZoning | 0 | 1 | 2 | 7 | 0 | 5 | 0 |
| Street | 0 | 1 | 4 | 4 | 0 | 2 | 0 |
| LotShape | 0 | 1 | 3 | 3 | 0 | 4 | 0 |
| LandContour | 0 | 1 | 3 | 3 | 0 | 4 | 0 |
| Utilities | 0 | 1 | 6 | 6 | 0 | 2 | 0 |
| LotConfig | 0 | 1 | 3 | 7 | 0 | 5 | 0 |
| LandSlope | 0 | 1 | 3 | 3 | 0 | 3 | 0 |
| Neighborhood | 0 | 1 | 5 | 7 | 0 | 25 | 0 |
| Condition1 | 0 | 1 | 4 | 6 | 0 | 9 | 0 |
| Condition2 | 0 | 1 | 4 | 6 | 0 | 7 | 0 |
| BldgType | 0 | 1 | 4 | 6 | 0 | 5 | 0 |
| HouseStyle | 0 | 1 | 4 | 6 | 0 | 8 | 0 |
| RoofStyle | 0 | 1 | 3 | 7 | 0 | 6 | 0 |
| RoofMatl | 0 | 1 | 5 | 7 | 0 | 7 | 0 |
| Exterior1st | 0 | 1 | 5 | 7 | 0 | 14 | 0 |
| Exterior2nd | 0 | 1 | 5 | 7 | 0 | 15 | 0 |
| MasVnrType | 0 | 1 | 4 | 7 | 0 | 4 | 0 |
| ExterQual | 0 | 1 | 2 | 2 | 0 | 4 | 0 |
| ExterCond | 0 | 1 | 2 | 2 | 0 | 5 | 0 |
| Foundation | 0 | 1 | 4 | 6 | 0 | 6 | 0 |
| BsmtQual | 0 | 1 | 2 | 4 | 0 | 5 | 0 |
| BsmtCond | 0 | 1 | 2 | 4 | 0 | 5 | 0 |
| BsmtExposure | 0 | 1 | 2 | 4 | 0 | 5 | 0 |
| BsmtFinType1 | 0 | 1 | 3 | 4 | 0 | 7 | 0 |
| BsmtFinType2 | 0 | 1 | 3 | 4 | 0 | 7 | 0 |
| Heating | 0 | 1 | 4 | 5 | 0 | 6 | 0 |
| HeatingQC | 0 | 1 | 2 | 2 | 0 | 5 | 0 |
| CentralAir | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| Electrical | 0 | 1 | 3 | 5 | 0 | 5 | 0 |
| KitchenQual | 0 | 1 | 2 | 2 | 0 | 4 | 0 |
| Functional | 0 | 1 | 3 | 4 | 0 | 7 | 0 |
| FireplaceQu | 0 | 1 | 2 | 4 | 0 | 6 | 0 |
| GarageType | 0 | 1 | 4 | 7 | 0 | 7 | 0 |
| GarageFinish | 0 | 1 | 3 | 4 | 0 | 4 | 0 |
| GarageQual | 0 | 1 | 2 | 4 | 0 | 6 | 0 |
| GarageCond | 0 | 1 | 2 | 4 | 0 | 6 | 0 |
| PavedDrive | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| SaleType | 0 | 1 | 2 | 5 | 0 | 9 | 0 |
| SaleCondition | 0 | 1 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| MSSubClass | 0 | 1 | 56.66 | 41.84 | 20 | 20 | 50 | 70 | 190 | ▇▅▂▁▁ |
| LotFrontage | 0 | 1 | 69.80 | 22.10 | 21 | 60 | 69 | 78 | 313 | ▇▂▁▁▁ |
| LotArea | 0 | 1 | 10504.23 | 9024.91 | 1300 | 7590 | 9480 | 11700 | 164660 | ▇▁▁▁▁ |
| OverallQual | 0 | 1 | 6.09 | 1.37 | 1 | 5 | 6 | 7 | 10 | ▁▂▇▅▁ |
| OverallCond | 0 | 1 | 5.55 | 1.11 | 1 | 5 | 5 | 6 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1 | 1971.32 | 30.23 | 1872 | 1954 | 1973 | 2000 | 2009 | ▁▂▃▆▇ |
| YearRemodAdd | 0 | 1 | 1984.87 | 20.67 | 1950 | 1966 | 1994 | 2004 | 2010 | ▅▂▂▃▇ |
| MasVnrArea | 0 | 1 | 103.77 | 177.12 | 0 | 0 | 0 | 169 | 1378 | ▇▁▁▁▁ |
| BsmtFinSF1 | 0 | 1 | 440.40 | 455.63 | 0 | 0 | 377 | 712 | 5644 | ▇▁▁▁▁ |
| BsmtFinSF2 | 0 | 1 | 48.62 | 168.30 | 0 | 0 | 0 | 0 | 1474 | ▇▁▁▁▁ |
| BsmtUnfSF | 0 | 1 | 568.88 | 440.47 | 0 | 225 | 479 | 806 | 2153 | ▇▆▂▁▁ |
| TotalBsmtSF | 0 | 1 | 1057.90 | 439.51 | 0 | 793 | 992 | 1304 | 6110 | ▇▃▁▁▁ |
| X1stFlrSF | 0 | 1 | 1161.52 | 381.50 | 334 | 882 | 1086 | 1394 | 4692 | ▇▅▁▁▁ |
| X2ndFlrSF | 0 | 1 | 340.03 | 428.95 | 0 | 0 | 0 | 728 | 1796 | ▇▂▂▁▁ |
| LowQualFinSF | 0 | 1 | 6.23 | 50.03 | 0 | 0 | 0 | 0 | 572 | ▇▁▁▁▁ |
| GrLivArea | 0 | 1 | 1507.77 | 503.07 | 334 | 1138 | 1466 | 1768 | 5642 | ▇▇▁▁▁ |
| BsmtFullBath | 0 | 1 | 0.43 | 0.52 | 0 | 0 | 0 | 1 | 3 | ▇▆▁▁▁ |
| BsmtHalfBath | 0 | 1 | 0.05 | 0.23 | 0 | 0 | 0 | 0 | 2 | ▇▁▁▁▁ |
| FullBath | 0 | 1 | 1.55 | 0.54 | 0 | 1 | 2 | 2 | 3 | ▁▇▁▇▁ |
| HalfBath | 0 | 1 | 0.38 | 0.50 | 0 | 0 | 0 | 1 | 2 | ▇▁▅▁▁ |
| BedroomAbvGr | 0 | 1 | 2.86 | 0.83 | 0 | 2 | 3 | 3 | 8 | ▁▇▂▁▁ |
| KitchenAbvGr | 0 | 1 | 1.05 | 0.22 | 0 | 1 | 1 | 1 | 3 | ▁▇▁▁▁ |
| TotRmsAbvGrd | 0 | 1 | 6.50 | 1.60 | 2 | 5 | 6 | 7 | 14 | ▂▇▇▁▁ |
| Fireplaces | 0 | 1 | 0.60 | 0.64 | 0 | 0 | 1 | 1 | 3 | ▇▇▁▁▁ |
| GarageYrBlt | 0 | 1 | 1978.79 | 23.89 | 1900 | 1963 | 1980 | 2001 | 2010 | ▁▁▃▅▇ |
| GarageCars | 0 | 1 | 1.78 | 0.75 | 0 | 1 | 2 | 2 | 4 | ▁▃▇▂▁ |
| GarageArea | 0 | 1 | 474.49 | 215.58 | 0 | 336 | 480 | 576 | 1418 | ▂▇▃▁▁ |
| WoodDeckSF | 0 | 1 | 94.71 | 125.46 | 0 | 0 | 0 | 168 | 857 | ▇▂▁▁▁ |
| OpenPorchSF | 0 | 1 | 45.97 | 65.11 | 0 | 0 | 25 | 66 | 523 | ▇▁▁▁▁ |
| EnclosedPorch | 0 | 1 | 21.83 | 59.62 | 0 | 0 | 0 | 0 | 386 | ▇▁▁▁▁ |
| X3SsnPorch | 0 | 1 | 3.27 | 27.54 | 0 | 0 | 0 | 0 | 407 | ▇▁▁▁▁ |
| ScreenPorch | 0 | 1 | 14.99 | 54.43 | 0 | 0 | 0 | 0 | 440 | ▇▁▁▁▁ |
| PoolArea | 0 | 1 | 2.53 | 39.13 | 0 | 0 | 0 | 0 | 738 | ▇▁▁▁▁ |
| MiscVal | 0 | 1 | 32.30 | 304.99 | 0 | 0 | 0 | 0 | 8300 | ▇▁▁▁▁ |
| MoSold | 0 | 1 | 6.28 | 2.73 | 1 | 4 | 6 | 8 | 12 | ▃▆▇▃▃ |
| YrSold | 0 | 1 | 2007.81 | 1.33 | 2006 | 2007 | 2008 | 2009 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1 | 179754.74 | 76193.47 | 34900 | 130000 | 163000 | 214000 | 625000 | ▇▇▂▁▁ |
We will deal with outliers later as they are a bit complicated to deal with in this dataset due to the distribution of the variables, this distribution can be seen in the graph we made below. If we do the scaling to remove outliers we will not be able to correctly visualize the data. Moreover, visualizing our data could be crucial for potentially finding out some outliers.
We want to generate some insights to obtain information about the Sale Price and how it relates to the remaining variables.
ggplot(training, aes(x = SalePrice, y = GrLivArea))+
geom_point(aes(color = OverallQual))+
scale_x_continuous(labels = scales::comma) +
facet_wrap(~ HouseStyle)+
labs(
title="Relation of Area, Quality and House Style in relation to the Sale Price",
x = "Sale Price",
y = "Area",
color = "Quality"
)+
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12)
)
This first graph compares the square feet area of the House in relation to the Sale Price. This is differentiated by color with respect to the quality of the house and all the houses are differentiated by the different dwelling styles which refer mostly to the different floors the house has.
This graph clearly shows a positive correlation between the Sale Price and the Area, which makes sense, the bigger the house the pricier the house. Both of these variables seem to also be correlated with the Overall Quality as usually more expensive houses or bigger houses have really good quality.
In relation to the dwelling, most of the houses are either 1 story or 2 story, we see that the price, area and quality is not really affected by the type of the dwelling, although it helps us to differentiate the data and we can see some points for example in the 2 story there is a house with really high area and moderate price which could be an outlier.
We will now compare the Sale Price respect to the year it was built and differentiating by the Type of Dwelling.
ggplot(training, aes(x = YearBuilt, y = SalePrice, color = BldgType)) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Trend of SalePrice Over YearBuilt by BldgType",
x = "Year Built",
y = "Sale Price") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
We can see that throughout the years the single family house (1Fam) was the most common house sold. Up until the 1980s two family homes (2fmCon) were also sold and sometimes reached higher sale prices than the single family, but from then on it seems they stopped being built (or we not have this info in our dataset). For the houses built from the 1980s onwards there was an increase in the different types and its price at which the houses were sold.
With respect to the price we can see a high increase in all the different housing categories when the 2000s are reached which could be due to inflation, an overall economic growth and a growth of urban population.
Moreover, there are also sudden decreases in price for example for Townhouse End Units (TwnhsE) which could be due to a lower demand or different and more affordable housing options.
To conclude our EDA we will do a correlation plot for the numeric data.
cor_matrix = cor(numeric_data)
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 90)
We observe that some columns like OverallQual (overall quality of the house) have a really strong positive correlation with the predictor, SalePrice. Moreover, many of them have little to no correlation with the predictor, so we will have to deal with them later since they may be noise affecting future models. Finally there are very few negative correlations between columns which is interesting.
We will now deal with potential outliers, since as we discussed before, handling them properly can improve our predictions. Firstly, we will visualize the distribution of each of the variables separating both numerical and categorical data again.
for (col_name in names(numeric_data)) {
hist(
numeric_data[[col_name]],
main = paste("Histogram of", col_name),
xlab = col_name,
col = "skyblue",
border = "white"
)
}
As we can see, there is a lot of variability in the data, some of them are left-skewed while some right-skewed. Also there are some discrete variables like FullBath that represents the number of full baths in the house. Before we can handle the outliers we will have to scale and transform the data.
Because of the wide variety of distributions in the data we cannot make every transformation by hand. Instead we will use a library that, based on mathematical values, applies the best transformation possible to the column and then scales it, so that it is as close to a Gaussian distribution as possible. The library had some problems with the transformation of some columns that had many 0 values, so we separated them and scaled them separately.
# Problematic columns (because of errors with bestNormalize)
problematic_cols <- c("LowQualFinSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch")
# Separating the problematic columns
problematic_data <- numeric_data[, problematic_cols]
new_numeric_data <- numeric_data[, !(names(numeric_data) %in% problematic_cols)]
# Scaling the remaining numeric data
# Create an empty list to store bestNormalize objects
training_bestNormalize <- list()
# Create a data frame to store transformed data
scaled_num_data <- data.frame(matrix(nrow = nrow(new_numeric_data), ncol = 0)) # Empty dataframe
# Apply bestNormalize to each column and store the objects
for (col in names(new_numeric_data)) {
best_norm <- bestNormalize(new_numeric_data[[col]], allow_orderNorm = TRUE)
training_bestNormalize[[col]] <- best_norm # Save the transformation object
scaled_num_data[[col]] <- best_norm$x.t # Save the transformed column
}
# Scaling the problematic columns separately
scaled_problematic <- as.data.frame(scale(problematic_data))
# Combining the scaled dataframes
final_scaled_data <- cbind(scaled_num_data, scaled_problematic)
# Plotting again the distributions
for (col_name in names(final_scaled_data)) {
hist(
final_scaled_data[[col_name]],
main = paste("Histogram of", col_name),
xlab = col_name,
col = "skyblue",
border = "white"
)
}
Now we can see that most of the data resembles a \(N(\mu=0, \sigma=1)\) so the scaling has been done correctly and we can fully focus on outliers.
For the detection of outliers we will use three algorithms following the logic of an ensemble, in order to have a more robust prediction. We will only use numerical columns, since clustering algorithms do not handle categorical variables properly without transformations.
The first algorithm we will use is DBSCAN. It is a clustering algorithm that identifies clusters based on the density of data points. Points are classified into three categories: core points, border points and noise, based on this:
\[N_\epsilon(p) = \{q \in D \mid d(p,q) \leq \epsilon \}\]
Where \(N_\epsilon\) is the \(\epsilon\)-neighborhood for a point. With this, a point \(p\) is a core point if:
\[|N_\epsilon(p)| \geq minPts\]
Where \(minPts\) is a hyper-parameter, the number of points for a region to be considered dense. Clusters are then formed by expanding the \(\epsilon\)-neighborhood of core points iteratively. Those observation not in this neighborhood will be the oultiers.
With this explanation we see the importance of picking the right \(\epsilon\) value (eps) and the number of neighbors (minPts). Because of this we will calculate the k-distances plot and use the elbow method to determine the eps. As minPts we will use 4, since it is a reasonable number in our case.
# Calculate k-distances
kNNdistplot(final_scaled_data, k = 10)
abline(h = 6.5, col = "red", lty = 2)
We see that there is an elbow at 6.5, so this is the value we will use for eps. With this we can finally train a DBSCAN model and plot our predictions for outliers.
# Training the model
model <- dbscan(final_scaled_data, eps = 6.5, minPts = 10)
# Performing pca for plotting
pca <- prcomp(final_scaled_data, center = TRUE)
# Get the first two principal components
pca_data <- as.data.frame(pca$x)
# Add cluster labels to the PCA data
pca_data$dbscan <- model$cluster
# Plot Clusters
ggplot(pca_data, aes(x = PC1, y = PC2, color = as.factor(dbscan))) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(values = c("red", RColorBrewer::brewer.pal(8, "Set2"))) +
labs(title = "DBSCAN Clustering with Outliers",
subtitle = "Red points indicate outliers (cluster 0)",
color = "Cluster") +
theme_minimal()
# Calculate the total number of outliers detected
sum(pca_data$dbscan == 0)
## [1] 47
The second method for outlier detection we will use is Isolation Forests. They are a tree-based unsupervised learning method specific for anomally detection. They use the idea that outliers are usually more ‘isolated’ than normal observation. Each tree in the forest splits the data by selecting a random feature and a split value.
The path length \(h(x)\) of a point \(x\) is the number of splits required to isolate the observation. Since outliers are far from dense clusters, usually they have shorter path lengths. With this we calculate the anomaly score:
\[ s(x) = 2^{-\frac{H(x)}{c(n)}} \]
Where \(c(n)\) is the average path length for a normal data point in a dataset of \(n\) observations, and \(H(x)\) is the average path legth of the point. The closest this anomaly score is to 1 the more likely it is an anomaly. Therefore we will look at the anomaly score to see where the outliers lie.
# Setting ndim=1 to only take into account one feature at each split
# for simplicity, and ntrees as 100 to make an ensemble for more robust
# predictions
isolation_model <- isolation.forest(final_scaled_data, ndim = 1, ntrees = 100)
outlier_score <- predict(isolation_model, final_scaled_data, type = "score")
# Visualizing the different values
hist(outlier_score, breaks = 20, main = "Distribution of Anomaly Scores")
We can see that most of the observations have an anomaly score between 0 and 0.5, so according to this model a value of around 0.5 should be the boundary between outliers and the rest of points. We can see that there is one observation far away from the rest while there are some observations between 0.52 and 0.6 which are few but could also be outliers.
We will set the inlier proportion to 0.95, meaning that we will assume that 5% of our data are outliers. With this and the previous model we trained, we can predict outliers.
# Stablishing a threshold for the highest values
threshold <- quantile(outlier_score, 0.95)
outliers <- which(outlier_score >= threshold)
print(unique(outliers))
## [1] 6 46 51 79 106 152 159 162 203 205 271 284 354 358 401
## [16] 402 414 429 432 470 478 508 509 520 534 563 565 598 645 650
## [31] 665 667 690 716 767 797 830 857 888 907 942 947 956 961 978
## [46] 990 992 1019 1021 1023 1046 1074 1085 1086 1094 1104 1115 1145 1165
pca_data$iso <- ifelse(1:nrow(pca_data) %in% outliers, 0, 1)
The third model we will use for outlier detection is OCSVM (One-Class Support Vector Machine). It is an unsupervised learning algorithm that tries to separate the anomalies from the rest of the data with a hyperplane (a line in 2D) that maximizes the distance between the outliers and the other points.
Mathematically, OCSVM maps points \(x_i\) into higher-dimensional feature space using a kernel function \(\phi(x)\). This is done to capture non-linear relationships (similar to what we did in Kernel K-Means). It then finds a decision function:
\[ f(x) = \langle w, \phi(x) \rangle - \rho \]
Where \(w\) is the weight vector of the hyperplane, \(\rho\) is the offset and \(\langle w, \phi(x) \rangle\) is the inner product in the feature space. Then the optimization proble minimizes the modulus of \(w\) while allowing a small fraction \(\upsilon\) (this controls the proportion of outliers) of points to lie outside the boudary. If \(f(x_i) < 0\) then \(x_i\) is classified as an outlier.
# Fit the One-Class SVM
ocsvm_model <- svm(
final_scaled_data,
type = "one-classification",
kernel = "radial", # RBF kernel
gamma = 1 / ncol(data), # Default gamma (1 / num_features)
nu = 0.05 # Proportion of outliers expected
)
# Get predictions
predictions <- predict(ocsvm_model, final_scaled_data)
# Add the numeric predictions as a column to the PCA data
pca_data$ocsvm <- ifelse(predictions, 1, 0)
# Plot Clusters
ggplot(pca_data, aes(x = PC1, y = PC2, color = as.factor(ocsvm))) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(values = c("red", RColorBrewer::brewer.pal(8, "Set2"))) +
labs(title = "OCSVM Outliers Detection",
subtitle = "Red points indicate outliers",
color = "Outlier") +
theme_minimal()
sum(pca_data$ocsvm == F)
## [1] 59
We keep using the dataset based on the PCA we have done (for plotting mainly), this dataset contains three extra columns which correspond to each of the outlier detection models we have made. In each of these columns, an observation can take a value of either 0 or 1, if it has a 0, it is considered an outlier.
Therefore, we created another column that decided if an observatoin had at least two 0’s, in other words, at least two models consider the observation to be an outlier, this point will be eliminated from our dataset. As different models assure that this points are atypical points we can discard them from our dataset confidently.
# Creating the final column based on the values
pca_data$outliers <- ifelse(rowSums(pca_data[, c("iso", "ocsvm", "dbscan")]) >= 2, 1, 0)
sum(pca_data$outliers == 0)
## [1] 50
# Plot Clusters
ggplot(pca_data, aes(x = PC1, y = PC2, color = as.factor(outliers), shape=as.factor(iso))) +
geom_point(size = 3, alpha = 0.65) +
scale_color_manual(values = c("red", RColorBrewer::brewer.pal(8, "Set2"))) +
labs(title = "Ensemble Outliers Detection",
subtitle = "Red points indicate outliers",
color = "Outlier",
shape = "Isolation Forest Prediction") +
theme_minimal()
The previous graph compares the ensemble’s prediction with the isolation forest, where the distinction by color is the outlier detection done by the ensemble and the distinction by shape is done by the Isolation Forest. We can see that most of the outliers are predicted the same way, although there are some discrepancies. These are mainly between the Isolation Forest and the other models, since their approach is quite different.
With this, we will now proceed to eliminate the outliers.
# Obtaining the indices of the outliers
outlier_indices <- which(pca_data$outliers == 0)
# Removing this indices from the dataset
final_scaled_data <- final_scaled_data[-outlier_indices,]
categorical_data <- categorical_data[-outlier_indices,]
For our categorical data we have around 39 variables and our numerical data has 37 variables. This yields up to 76 columns. This is a really high dimension and this could really affect our future analysis. Because of this, we will reduce our dataset to obtain the most important variables which affect our target variable sale price.
We will obtain the most important features with the help of RFE method. RFE or Recursive Feature Elimination builds many models based on the target variable. It will remove variables based on their importance calculated with the Gini index. With this reduced dataset it will create another model and again remove the least important variable. This process is repeated until the best value is found. After applying this process we will obtain an optimal subset of predictors which contribute the most to the target variable.
When using this method there are two hyperparameters to take into account, the type of model that will be created, which in our case due to that we have categorical data we will clearly use Random Forests. Apart from that, we can also select the number features that will be used in each of the models.
As we want to reduce our dimensionality based on the output of the RFE we will select an specific amount of variables which return a good value and do not make us loose a lot of information about the target variable .
# We will have to separate the predictors and target variable
num_predictors <- final_scaled_data[,-which(names(final_scaled_data) == "SalePrice")]
target_variable <- final_scaled_data$SalePrice
control <- rfeControl(functions = rfFuncs, method = "cv", number = 5)
# Numerical columns
rfe_num <- rfe(num_predictors, target_variable, sizes = seq(5, 35, by = 5), rfeControl = control)
print(rfe_num)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 5 0.4012 0.8335 0.2966 0.020884 0.018234 0.015132
## 10 0.3327 0.8859 0.2431 0.007313 0.005621 0.010872
## 15 0.3287 0.8882 0.2379 0.007185 0.004820 0.007574 *
## 20 0.3293 0.8888 0.2379 0.007489 0.003701 0.007542
## 25 0.3297 0.8888 0.2371 0.007435 0.004012 0.007596
## 30 0.3300 0.8884 0.2375 0.009204 0.005014 0.008448
## 35 0.3289 0.8896 0.2364 0.008933 0.004937 0.008459
## 36 0.3299 0.8890 0.2375 0.010054 0.005890 0.008816
##
## The top 5 variables (out of 15):
## GrLivArea, OverallQual, BsmtFinSF1, TotalBsmtSF, YearBuilt
# Categorical columns
rfe_cat <- rfe(categorical_data, target_variable, sizes = seq(5, 15, by = 2), rfeControl = control)
print(rfe_cat)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 5 0.6427 0.6080 0.4959 0.05820 0.07963 0.04226
## 7 0.5791 0.6690 0.4436 0.05236 0.05171 0.03859
## 9 0.5225 0.7184 0.3983 0.03426 0.05924 0.02289 *
## 11 0.5330 0.7226 0.4056 0.04990 0.04163 0.04331
## 13 0.5401 0.7224 0.4153 0.04645 0.04136 0.03839
## 15 0.5469 0.7262 0.4204 0.03196 0.03265 0.03154
## 39 0.5740 0.7282 0.4483 0.06758 0.03742 0.06774
##
## The top 5 variables (out of 9):
## Neighborhood, GarageType, FireplaceQu, BldgType, ExterQual
The first column variables, refers to the number of variables each of the models that have been created has. RMSE is the root mean squared error which is the average of the error between predicted and actual values. The lower this value is, the better. The R squared shows the proportion of the variance of the target variable, the closer to one, the better. For MAE (Mean Absolute Error), we want to obtain a low number. The RMSESD, which is the standard deviation of the RMSE, lower values indicate a grater performance. RsquaredSD, the standard deviation of R squared. Same as the previous variables, we want a low value. MAESD, the standard deviation of the MAE, same as before, the lower the better. Finally, the selected column shows an asterisk in the final chosen model.
In this case we will focus on the first three columns we want to minimize the number of variables which return a low prediction error and explain a high percentage of the variance of the sale price.
We can see that for numerical data the model for 15 variables is not the best result-wise but it returns a really good value despite it being a really big dimensionality reduction, therefore we will choose 15 variables.
Moreover, for categorical data, the lowest MSE and MAE is obtained by the model with 9 variables, it is also a decent value for the R squared based on the fact that it is only 9 variables.
full_scaled_data = cbind(final_scaled_data, categorical_data)
# Selecting 15 features for numerical data
selected_num_features <- predictors(rfe_num)[1:15]
print(selected_num_features)
## [1] "GrLivArea" "OverallQual" "BsmtFinSF1" "TotalBsmtSF" "YearBuilt"
## [6] "X1stFlrSF" "LotArea" "OverallCond" "GarageArea" "Fireplaces"
## [11] "X2ndFlrSF" "YearRemodAdd" "MSSubClass" "LotFrontage" "GarageCars"
# Selecting 9 features for categorical data
selected_cat_features <- predictors(rfe_cat)[1:9]
print(selected_cat_features)
## [1] "Neighborhood" "GarageType" "FireplaceQu" "BldgType" "ExterQual"
## [6] "BsmtQual" "MSZoning" "HouseStyle" "KitchenQual"
# Combining both results
selected_features <- c(selected_num_features, selected_cat_features)
# Obtaining our reduced dataset with the most important variables.
reduced_data <- full_scaled_data[,selected_features]
reduced_data$SalePrice <- target_variable
We reduced our numerical features to 15 variables and our categorical data to 9 variables. Going from 76 variables to 24, where we added the target variable.
Making a feature importance plot based on the RFE results, it will rank in both datasets how early each variable was selected and afterwards create a plot.
# Extract feature importance from RFE objects
num_importance <- rfe_num$variables
cat_importance <- rfe_cat$variables
selected_num_features2 <- num_importance$var[1:25]
print(selected_num_features2)
## [1] "GrLivArea" "OverallQual" "TotalBsmtSF" "BsmtFinSF1" "LotArea"
## [6] "X1stFlrSF" "YearBuilt" "Fireplaces" "OverallCond" "X2ndFlrSF"
## [11] "GarageArea" "MSSubClass" "YearRemodAdd" "BsmtUnfSF" "GarageCars"
## [16] "LotFrontage" "GarageYrBlt" "TotRmsAbvGrd" "BedroomAbvGr" "FullBath"
## [21] "OpenPorchSF" "MasVnrArea" "HalfBath" "BsmtFullBath" "WoodDeckSF"
# Combine feature importance
feature_importance <- rbind(
data.frame(Feature = num_importance$var, Importance = num_importance$Overall, Type = "Numerical"),
data.frame(Feature = cat_importance$var, Importance = cat_importance$Overall, Type = "Categorical")
)
# Ensure unique features by aggregating importance
feature_importance <- feature_importance %>%
group_by(Feature, Type) %>%
summarise(Importance = mean(Importance, na.rm = TRUE), .groups = "drop")
# Sort features by importance
feature_importance <- feature_importance[order(-feature_importance$Importance), ]
# Plot
ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance, fill = Type)) +
geom_bar(stat = "identity", color="black") +
coord_flip() +
labs(
title = "Feature Importance from RFE",
x = "Features",
y = "Importance"
) +
theme_minimal()
This resulted in a final dataset of 1118 observations and 25 variables which is a really big change with respect to what we previously had, and will improve our future analysis.
After having reduced the dimensionality of categorical features, we can proceed with encoding.
Most statistical and machine learning models require the predictors to be in some sort of numeric format. In this case, most of the algorithms and models that we will use later in our project need data to be numerical. There are many encoding methods that we could use to convert categorical variables into numerical ones, so we have to choose wisely.
data_enc <- reduced_data
data_not_enc <- reduced_data
# Convert all character columns to factors
data_not_enc[] <- lapply(data_not_enc, function(col) {
if (is.character(col)) as.factor(col) else col
})
skim(reduced_data)
| Name | reduced_data |
| Number of rows | 1119 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Neighborhood | 0 | 1 | 5 | 7 | 0 | 25 | 0 |
| GarageType | 0 | 1 | 4 | 7 | 0 | 7 | 0 |
| FireplaceQu | 0 | 1 | 2 | 4 | 0 | 6 | 0 |
| BldgType | 0 | 1 | 4 | 6 | 0 | 5 | 0 |
| ExterQual | 0 | 1 | 2 | 2 | 0 | 4 | 0 |
| BsmtQual | 0 | 1 | 2 | 4 | 0 | 5 | 0 |
| MSZoning | 0 | 1 | 2 | 7 | 0 | 5 | 0 |
| HouseStyle | 0 | 1 | 4 | 6 | 0 | 8 | 0 |
| KitchenQual | 0 | 1 | 2 | 2 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| GrLivArea | 0 | 1 | -0.02 | 0.95 | -2.86 | -0.68 | -0.02 | 0.64 | 2.60 | ▁▅▇▅▁ |
| OverallQual | 0 | 1 | -0.02 | 0.94 | -2.26 | -0.74 | -0.20 | 0.47 | 4.76 | ▁▇▁▁▁ |
| BsmtFinSF1 | 0 | 1 | 0.03 | 0.89 | -0.99 | -0.99 | 0.00 | 0.66 | 3.02 | ▇▆▃▁▁ |
| TotalBsmtSF | 0 | 1 | -0.01 | 0.98 | -2.23 | -0.68 | -0.01 | 0.66 | 3.02 | ▂▇▇▃▁ |
| YearBuilt | 0 | 1 | 0.01 | 0.98 | -3.33 | -0.65 | 0.01 | 0.71 | 2.57 | ▁▃▇▆▂ |
| X1stFlrSF | 0 | 1 | -0.01 | 0.97 | -2.75 | -0.68 | -0.01 | 0.65 | 2.86 | ▁▅▇▅▁ |
| LotArea | 0 | 1 | -0.03 | 0.97 | -3.33 | -0.68 | -0.02 | 0.63 | 2.86 | ▁▃▇▅▁ |
| OverallCond | 0 | 1 | -0.02 | 0.95 | -2.28 | -0.50 | -0.50 | 0.19 | 4.78 | ▁▇▂▁▁ |
| GarageArea | 0 | 1 | 0.00 | 0.96 | -1.91 | -0.68 | -0.01 | 0.65 | 3.02 | ▃▇▇▃▁ |
| Fireplaces | 0 | 1 | -0.01 | 1.00 | -1.02 | -1.02 | 0.82 | 0.82 | 2.17 | ▇▁▇▁▁ |
| X2ndFlrSF | 0 | 1 | -0.02 | 0.95 | -0.73 | -0.73 | -0.73 | 0.70 | 4.58 | ▇▃▁▁▁ |
| YearRemodAdd | 0 | 1 | 0.01 | 0.96 | -1.53 | -0.69 | 0.01 | 0.73 | 3.14 | ▆▇▇▂▁ |
| MSSubClass | 0 | 1 | -0.02 | 0.99 | -1.02 | -1.02 | 0.00 | 0.50 | 2.62 | ▇▆▃▁▁ |
| LotFrontage | 0 | 1 | -0.01 | 0.98 | -2.49 | -0.65 | 0.00 | 0.65 | 3.14 | ▂▆▇▃▁ |
| GarageCars | 0 | 1 | 0.01 | 0.96 | -3.29 | -0.71 | 0.36 | 0.36 | 1.87 | ▁▁▃▇▂ |
| SalePrice | 0 | 1 | -0.01 | 0.96 | -3.02 | -0.66 | -0.01 | 0.65 | 2.86 | ▁▃▇▅▁ |
# Get unique values for each categorical column
lapply(reduced_data[, sapply(reduced_data, is.character)], unique)
## $Neighborhood
## [1] "CollgCr" "Veenker" "Crawfor" "NoRidge" "Somerst" "OldTown" "BrkSide"
## [8] "NridgHt" "NAmes" "Sawyer" "SawyerW" "IDOTRR" "MeadowV" "Edwards"
## [15] "Mitchel" "Gilbert" "ClearCr" "NWAmes" "NPkVill" "Timber" "StoneBr"
## [22] "Blmngtn" "BrDale" "SWISU" "Blueste"
##
## $GarageType
## [1] "Attchd" "Detchd" "BuiltIn" "CarPort" "None" "Basment" "2Types"
##
## $FireplaceQu
## [1] "None" "TA" "Gd" "Fa" "Ex" "Po"
##
## $BldgType
## [1] "1Fam" "2fmCon" "Duplex" "TwnhsE" "Twnhs"
##
## $ExterQual
## [1] "Gd" "TA" "Ex" "Fa"
##
## $BsmtQual
## [1] "Gd" "TA" "Ex" "None" "Fa"
##
## $MSZoning
## [1] "RL" "RM" "C (all)" "FV" "RH"
##
## $HouseStyle
## [1] "2Story" "1Story" "1.5Fin" "1.5Unf" "SFoyer" "SLvl" "2.5Fin" "2.5Unf"
##
## $KitchenQual
## [1] "Gd" "TA" "Ex" "Fa"
Firstly we can see that there are many variables that have a predetermined order, like ExterQual (Excellent, Good…). As there is a clear distinction between the values of the variables we will use label encoding, that assigns to each instance a number so that there is a scale.
data_enc$ExterQual <- as.integer(factor(data_enc$ExterQual, levels = c("Po", "Fa", "TA", "Gd", "Ex")))
data_enc$FireplaceQu <- as.integer(factor(data_enc$FireplaceQu, levels = c("None", "Po", "Fa", "TA", "Gd", "Ex")))
data_enc$BsmtQual <- as.integer(factor(data_enc$BsmtQual, levels = c("None", "Fa", "TA", "Gd", "Ex")))
data_enc$KitchenQual <- as.integer(factor(data_enc$KitchenQual, levels = c("Fa", "TA", "Gd", "Ex")))
There are 5 variables left that have not any order in their categories, for them we have two options. Firstly, we could use one-hot encoding (creating columns with 0s and 1s depending on the unique values of each column), but this is not feasable for most variables, since they have many unique values, and would increase the number of columns highly.
Instead, we will use target encoding, where each unique value is replaced with the mean SalePrice of it, since we know from the feature extraction that these variables are important. This aproach has a risk, which is data leakage, since these columns may “give too much information” about the target value to a future model. For this we will use cross validation, in order not to have this data leakage.
First we will create a function that will carry out this target encoding.
target_encode_cv_with_means <- function(data, target_col, cat_columns, n_folds = 5) {
folds <- createFolds(data[[target_col]], k = n_folds) # Create folds
# Initialize the encoded dataset
data_encoded <- data
# Initialize a list to store overall means for each categorical column
overall_means <- list()
# Loop through each categorical column
for (col in cat_columns) {
# Add a new column for the encoded values
data_encoded[[paste0(col, '_enc')]] <- NA
# Process each fold
for (fold in 1:n_folds) {
# Split into training and validation folds
val_indices <- folds[[fold]]
train_fold <- data[-val_indices, ]
val_fold <- data[val_indices, ]
# Compute the target mean for each category in the training fold
means <- train_fold %>%
group_by(.data[[col]]) %>%
summarize(mean_target = mean(.data[[target_col]], na.rm = TRUE), .groups = "drop")
# Merge the means with the validation fold
val_fold <- val_fold %>%
left_join(means, by = col) %>%
mutate(mean_target = ifelse(is.na(mean_target), mean(train_fold[[target_col]], na.rm = TRUE), mean_target))
# Assign the encoded values to the appropriate rows in the dataset
data_encoded[val_indices, paste0(col, '_enc')] <- val_fold$mean_target
}
# After cross-validation, compute overall mean encoding for this column
overall_means[[col]] <- data %>%
group_by(.data[[col]]) %>%
summarize(mean_target = mean(.data[[target_col]], na.rm = TRUE), .groups = "drop")
}
return(list(encoded_data = data_encoded, overall_means = overall_means))
}
We now have the code that encodes our desired columns based on target encoding, therefore we will now apply it to our dataset and finish with the preprocessing.
# Remaining variables
categorical_columns <- c("HouseStyle", "MSZoning", "Neighborhood", "BldgType", "GarageType")
# Applying the function
result <- target_encode_cv_with_means(data_enc, target_col = "SalePrice", cat_columns = categorical_columns)
df_encoded <- result$encoded_data
training_target_means <- result$overall_means
# Switching the variables
data_enc$HouseStyle <- df_encoded$HouseStyle_enc
data_enc$Neighborhood <- df_encoded$Neighborhood_enc
data_enc$MSZoning <- df_encoded$MSZoning_enc
data_enc$BldgType <- df_encoded$BldgType_enc
data_enc$GarageType <- df_encoded$GarageType_enc
# Scaling the data because of the encodings
data_enc[, (ncol(data_enc) - 10):(ncol(data_enc) - 1)] <- scale(data_enc[, (ncol(data_enc) - 10):(ncol(data_enc) - 1)])
# Obtaining a general view of our final encoded dataset
skim(data_enc)
| Name | data_enc |
| Number of rows | 1119 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| numeric | 25 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| GrLivArea | 0 | 1 | -0.02 | 0.95 | -2.86 | -0.68 | -0.02 | 0.64 | 2.60 | ▁▅▇▅▁ |
| OverallQual | 0 | 1 | -0.02 | 0.94 | -2.26 | -0.74 | -0.20 | 0.47 | 4.76 | ▁▇▁▁▁ |
| BsmtFinSF1 | 0 | 1 | 0.03 | 0.89 | -0.99 | -0.99 | 0.00 | 0.66 | 3.02 | ▇▆▃▁▁ |
| TotalBsmtSF | 0 | 1 | -0.01 | 0.98 | -2.23 | -0.68 | -0.01 | 0.66 | 3.02 | ▂▇▇▃▁ |
| YearBuilt | 0 | 1 | 0.01 | 0.98 | -3.33 | -0.65 | 0.01 | 0.71 | 2.57 | ▁▃▇▆▂ |
| X1stFlrSF | 0 | 1 | -0.01 | 0.97 | -2.75 | -0.68 | -0.01 | 0.65 | 2.86 | ▁▅▇▅▁ |
| LotArea | 0 | 1 | -0.03 | 0.97 | -3.33 | -0.68 | -0.02 | 0.63 | 2.86 | ▁▃▇▅▁ |
| OverallCond | 0 | 1 | -0.02 | 0.95 | -2.28 | -0.50 | -0.50 | 0.19 | 4.78 | ▁▇▂▁▁ |
| GarageArea | 0 | 1 | 0.00 | 0.96 | -1.91 | -0.68 | -0.01 | 0.65 | 3.02 | ▃▇▇▃▁ |
| Fireplaces | 0 | 1 | -0.01 | 1.00 | -1.02 | -1.02 | 0.82 | 0.82 | 2.17 | ▇▁▇▁▁ |
| X2ndFlrSF | 0 | 1 | -0.02 | 0.95 | -0.73 | -0.73 | -0.73 | 0.70 | 4.58 | ▇▃▁▁▁ |
| YearRemodAdd | 0 | 1 | 0.01 | 0.96 | -1.53 | -0.69 | 0.01 | 0.73 | 3.14 | ▆▇▇▂▁ |
| MSSubClass | 0 | 1 | -0.02 | 0.99 | -1.02 | -1.02 | 0.00 | 0.50 | 2.62 | ▇▆▃▁▁ |
| LotFrontage | 0 | 1 | -0.01 | 0.98 | -2.49 | -0.65 | 0.00 | 0.65 | 3.14 | ▂▆▇▃▁ |
| GarageCars | 0 | 1 | 0.00 | 1.00 | -3.46 | -0.76 | 0.36 | 0.36 | 1.94 | ▁▁▃▇▂ |
| Neighborhood | 0 | 1 | 0.00 | 1.00 | -1.86 | -0.79 | 0.25 | 0.56 | 2.04 | ▆▇▅▇▂ |
| GarageType | 0 | 1 | 0.00 | 1.00 | -2.38 | -1.16 | 0.60 | 0.62 | 1.62 | ▁▃▁▇▁ |
| FireplaceQu | 0 | 1 | 0.00 | 1.00 | -0.99 | -0.99 | 0.12 | 1.24 | 1.79 | ▇▁▁▃▅ |
| BldgType | 0 | 1 | 0.00 | 1.00 | -4.80 | 0.23 | 0.26 | 0.28 | 0.84 | ▁▁▁▁▇ |
| ExterQual | 0 | 1 | 0.00 | 1.00 | -2.50 | -0.70 | -0.70 | 1.11 | 2.91 | ▁▇▁▅▁ |
| BsmtQual | 0 | 1 | 0.00 | 1.00 | -3.24 | -0.65 | 0.64 | 0.64 | 1.93 | ▁▁▇▇▂ |
| MSZoning | 0 | 1 | 0.00 | 1.00 | -5.52 | 0.36 | 0.37 | 0.38 | 1.45 | ▁▁▂▁▇ |
| HouseStyle | 0 | 1 | 0.00 | 1.00 | -3.77 | -0.22 | -0.13 | 1.16 | 1.20 | ▁▂▁▇▅ |
| KitchenQual | 0 | 1 | 0.00 | 1.00 | -2.32 | -0.78 | -0.78 | 0.77 | 2.31 | ▁▇▁▆▁ |
| SalePrice | 0 | 1 | -0.01 | 0.96 | -3.02 | -0.66 | -0.01 | 0.65 | 2.86 | ▁▃▇▅▁ |
After all of the preprocessing, it is interesting to do again another correlation plot to see how our data has changed after the preprocessing and to perhaps identify some mistakes we did.
cor_matrix = cor(data_enc)
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 90)
We now can see that most of the variables have a strong positive relation with the predictor, with this we are sure that the feature extraction we did was successful. Furthermore, after the encoding we see that many variables that were categorical have a strong correlation with the SalePrice, such as Neighborhood. This tells us that doing the encoding to handle categorical variables was worth it.
Many of the algorithms we will use are specific for classification, while our predictor is a numerical variable. To deal with this problem we will divide the price of each house into 5 categories (Very Low, Low, Medium, High, Very High). We will make this classification based on quantiles so that we have balanced categories, which will make it easier to train posterior models.
# Quantile-based bins
num_categories <- 5
training_saleprice_breaks <- quantile(data_enc$SalePrice, probs = seq(0, 1, length.out = num_categories + 1))
data_enc$SalePrice_Category <- cut(data_enc$SalePrice,
breaks = training_saleprice_breaks,
labels = paste0("Category_", 1:num_categories),
include.lowest = TRUE)
# Assign descriptive labels
labels <- c("Very Low", "Low", "Medium", "High", "Very High")
data_enc$SalePrice_Category <- cut(data_enc$SalePrice,
breaks = training_saleprice_breaks,
labels = labels,
include.lowest = TRUE)
# For not encoded data
# Quantile-based bins
num_categories <- 5
data_not_enc$SalePrice_Category <- cut(data_not_enc$SalePrice,
breaks = training_saleprice_breaks,
labels = paste0("Category_", 1:num_categories),
include.lowest = TRUE)
# Assign descriptive labels
labels <- c("Very Low", "Low", "Medium", "High", "Very High")
data_not_enc$SalePrice_Category <- cut(data_not_enc$SalePrice,
breaks = training_saleprice_breaks,
labels = labels,
include.lowest = TRUE)
# Plot categories
ggplot(data_enc, aes(x = SalePrice_Category, fill = SalePrice_Category)) +
geom_bar()
We will look at each algorithm starting with the simpler ones and building up to the more complex and precise ones. Along with each prediction, we will follow up with the interpretability of the results while trying to minimize the prediction error.
This interpretability may be hard to obtain as some models are ‘black boxes’ and it is complicated to see how they arrived at their conclusions. We will use SHAP values (SHapley Additive exPlanations) which measures how much each feature contributes to the prediction, we will use them for some models which return useful values and do not support categorical data as if not it is. Moreover, we will also use other graphs such as ROC curves or Feature Importance for a better understanding.
We will first look at decision trees, since they are a simple model. Although there are better algorithms for prediction, they are the foundations for more sophisticated methods. Furthermore, it is a white-box model, so it will help us understand how the predictions are done.
Mathematically, decision trees do a split in the data based on selected features. They do this split based on the Gini index, a metric that is similar to accuracy. The splits that yield the best Gini index are located at the root of the tree.
Since we are focusing on classification we will remove the target variable salePrice from the dataset. Furthermore, since decision trees can handle categorical variables, and for understanding purposes, we will use the dataset that is not encoded.
# Removing the penultimate columns which is where the Sale Price is found
data_not_enc <- data_not_enc[,-(ncol(data_not_enc)-1)]
data_enc <- data_enc[,-(ncol(data_enc)-1)]
Even though we know that the decision tree is not going to be our best model, we will do a 10-fold cross validation to check more or less the performance of this simple algorithm. In addition, we will not do any hyper-parameter tuning since as we said, our main focus with this algorithm is interpretability.
# For reproducibility purposes
set.seed(123)
# Setting up cross-validation
cv_folds <- trainControl(method = "cv", number = 10)
# Performing cross-validation
cv_model <- train(SalePrice_Category ~ ., data = data_not_enc, method = "rpart", trControl = cv_folds, tuneGrid = data.frame(cp = 0.01))
# Checking results
print(cv_model)
## CART
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5649319 0.4559885
##
## Tuning parameter 'cp' was held constant at a value of 0.01
# Saving the final model for visualization
final_dt <- cv_model$finalModel
We see a ~56% accuracy, which is not bad but there is room for improvement. Also we obtain a ~0.45 Kappa value, which indicates a moderate agreement between the model’s predictions and true labels.
Now, we can plot the decision tree so that we can understand how the predictions are done.
rpart.plot(final_dt, digits = 3)
The first split done is by OverallQual, which measure the overall quality of the house, with this we can see that houses with a higher quality usually are pricier, as expected. In the right part of the tree, we observe another split by OverallQual, so it indicates that houses with a very high overall quality are the most expensive ones. Another split is by GrLivArea, which measures the above ground area of the house, meaning that bigger houses are more expensive, which also makes sense. Finally, it splits by GarageType, with the houses that have a garage attached to home being more expensive.
In the other side of the decision tree, we observe a split with GarageArea, with observations with small garages being cheaper. We also see other splits, with BsmtFinSF1, with houses with smaller basements being less expensive. Finally we also see other splits, but all regarding the area of some part of the house, like the first floor.
With this we learn that the most important variables may be the overall quality of the house, as well as some other metrics related to the area of different parts of the house.
The next algorithm we will use is k-Nearest Neighbors. It is a supervised learning model that performs classification. Mathematically, the algorithm calculates, for each new data point, its distance to each point in the training data. We then define a hyperparameter \(k\), that selects the \(k\) closest data points to this new observation. The most common class of these \(k\) closest observations is then chosen as the predicted class.
Although this algorithm has some problems when dimensionality is high and it is a black-box model, we will use it to see how this types of algorithms perform in our data. This will help us understand whether the classes can be easily separable or not, as well as if more complex models are needed.
Before training the final algorithm we have to choose the value for \(k\), which is crucial for the algorithm to work optimally. For this we will perform hyper-parameter tuning for it, while measuring the efficiency of the model for each different value of \(k\) with cross-validation.
# For reproducibility purposes
set.seed(123)
# Setting up cross-validation
train_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Defining the tuning grid for k values
tune_grid <- expand.grid(k = 1:20) # k takes values from 1 to 20
# Training the KNN model with cross-validation
knn_model <- train(SalePrice_Category ~ ., data = data_enc, method = "knn", trControl = train_control, tuneGrid = tune_grid)
# Displaying the results
print(knn_model)
## k-Nearest Neighbors
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.6238813 0.5298143
## 2 0.6282658 0.5353174
## 3 0.6354174 0.5442558
## 4 0.6390686 0.5488309
## 5 0.6487472 0.5609226
## 6 0.6514179 0.5641511
## 7 0.6505804 0.5631394
## 8 0.6363419 0.5454197
## 9 0.6479100 0.5598105
## 10 0.6460998 0.5575504
## 11 0.6416274 0.5519713
## 12 0.6398417 0.5497407
## 13 0.6407348 0.5508617
## 14 0.6407668 0.5509258
## 15 0.6371392 0.5463975
## 16 0.6326426 0.5407471
## 17 0.6487073 0.5608201
## 18 0.6461003 0.5575694
## 19 0.6487471 0.5609226
## 20 0.6460682 0.5575376
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 6.
We see that the optimal \(k\), is 6 as it yields approximately a ~65% accuracy and a kappa of ~0.56. To visualize this model, we will train it on the whole dataset and perform PCA to visualize the classification in 2 dimensions.
# For reproducibility purposes
set.seed(123)
# Training the KNN model using the caret package
train_control <- trainControl(method = "none") # No resampling
# Defining the grid of values for k
tune_grid <- expand.grid(k = 6) # Setting k = 6 (best value obtained)
# Training the KNN model using the caret on the entire dataset
final_knn <- train(SalePrice_Category ~ ., data = data_enc, method = "knn", trControl = train_control, tuneGrid = tune_grid)
# Performing PCA on the feature columns (exclude the target variable 'Class')
pca_result <- prcomp(data_enc[, -which(names(data_enc) == "SalePrice_Category")])
# Extracting the first two principal components
pca_data <- data.frame(pca_result$x[, 1:2]) # First two PCs
pca_data$SalePrice_Category <- data_enc$SalePrice_Category # Adding class labels for visualization
# Getting predicted class labels using the trained KNN model
pca_data$Predicted_Class_kNN <- predict(final_knn, newdata = data_enc)
# Visualizing the PCA result with predicted class labels
ggplot(pca_data, aes(x = PC1, y = PC2, color = Predicted_Class_kNN, shape=SalePrice_Category)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "KNN Predictions in 2D (PCA)", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(pca_data$Predicted_Class_kNN))))
We observe that most of the points in the class Very High have been classified correctly. We also see how Very Low has mostly been classified correctly, although there are more errors than for Very High prices. On the other hand, the model struggles to distinguish between the 3 middle classes, which could be due to many factors, but one of them might be because the classification may be non-linear. To solve this we could use another type of distance in the k-NN such as Mahalanobis, or we could apply the kernel trick. Although these options might better the prediction slightly, we do not think that they would make the model perform much better, since the errors appear to be specific flaws of the k-NN algorithm itself. Instead, what we will do is focus more on the other models that might capture better this complexity of the data.
However we will plot the SHAP values to see if we can obtain more insights and see what values have more importance in our KNN model.
# Defining a prediction wrapper for probabilities
predict_function <- function(model, newdata) {
predict(model, newdata = newdata, type = "prob")
}
# Creating the Predictor object
predictor_knn <- Predictor$new(
model = knn_model,
data = data_enc[, -which(names(data_enc) == "SalePrice_Category")],
y = data_enc$SalePrice_Category,
predict.fun = predict_function
)
# Computing SHAP values for the first observation
shap_knn <- Shapley$new(predictor_knn, x.interest = data_enc[1, -which(names(data_enc) == "SalePrice_Category")])
Plotting the results.
# Obtaining the results
shap_values_knn <- shap_knn$results
long_shap_knn <- melt(shap_values_knn)
## Using feature, class, feature.value as id variables
# Sorting by importance for better readability
long_shap_knn <- long_shap_knn %>%
mutate(feature = reorder(feature, abs(value), FUN = mean))
# Creating the violin plots
ggplot(long_shap_knn, aes(x = value, y = feature)) +
geom_violin(fill = "lightblue", scale = "width") +
ggtitle("SHAP Values for KNN") +
xlab("SHAP value (impact on model output)") +
ylab("Feature") +
theme_minimal()
The plot shows how each feature contributes to the model output. Wider distributions indicate more variability in feature impact. A violin with mean around 0 has almost no impact as its shap values are near zero. We can see that in this model MSZoning and Overall Condition have the lower SHAP values whereas ExterQual has the most impact.
The next algorithms we will see are Quadratic Discriminant Analysis and Linear Discriminant Analysis which are really similar,
The biggest difference is that Linear Discriminant Analysis assumes that the different classes in the prediction all have the same covariance matrix (This will be explained in more detail below). This is a really big difference, which could potentially be one of the reasons why we chose one process over the other.
To see if the covariance matrices are the same or not we will use Box’s M Test which tests if two or more covariance matrices are equal or not. This test assumes normality from the data if not it could have many errors.
The null hypothesis for this test states that the covariance matrices are equal, a really high p value indicates that this is true. We will use the library biotools that will perform this test. It compares the product of log determinants of the matrices to the log determinant of a pooled matrix (average of all the matrices), it then uses a chi squared approximation for the result. It is based on the following formula:
\[ M = \frac{n - k}{k \cdot (n - 1)} \sum_{i=1}^{k} (n_i - 1) \cdot \log \left( \det(\Sigma_i) \right) - \sum_{i=1}^{k} (n_i - 1) \cdot \log \left( \det(\Sigma) \right) \]
Where \(n\) is the number of observations, \(k\) the number of classes, \(\Sigma_i\) each covariance matrix, and finally the pooled matrix.
# Performing the test
boxM_test <- boxM(data_enc[,-which(names(data_enc) == "SalePrice_Category")],
grouping = data_enc$SalePrice_Category)
print(boxM_test)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data_enc[, -which(names(data_enc) == "SalePrice_Category")]
## Chi-Sq (approx.) = 7148, df = 1200, p-value < 2.2e-16
This test clearly suggest that the covariance matrices are different therefore Quadratic Discrimant Analysis will more likely be better in our dataset. However, in this project we are covering many different supervised learning algorithms to study and understand them. This is why we will also cover Linear Discriminant Analysis, as Box M test may make some errors, and LDA could provide a similar classification while being less computationally expensive.
The first algorithm we will use is Quadratic Discriminant Analysis (QDA) . It is a supervised learning technique used for classification. QDA assumes that all classes follow a normal distribution they have a mean vector and a covariance matrix (different for each class), it provides a division of the data with a quadratic decision boundary.
This algorithm has problems with really high dimensionality as it needs to estimate many vectors and covariance matrices. Moreover it is also really sensitive to deviations from Gaussian distribution. It is an unbiased estimator with a high variance. This problems will be tackled by LDA, which assumes the same covariances for all classes therefore we will also test it afterwards
Mathematically QDA is based on the Bayes Theorem which calculates the probability of an element belonging to an specific class, the mathematical formula estimates mean vectors, covariance matrices and prior probabilities.
The formula is given by
\[ \delta_k(x) = -\frac{1}{2} \log |\Sigma_k| - \frac{1}{2} (x - \mu_k)^T \Sigma_k^{-1} (x - \mu_k) + \log P(y = k) \]
The first term corresponds to the logarithm of the determinant of the covariance matrix, the logarithm appears due to the derivation from Gaussian distribution. The second term measures how far is \(x\) in relation to the mean multiplied by the covariance matrix, sometimes this matrix is not singular and therefore non invertible which is another problem. We want to minimize this first two terms as much as possible. Finally, the third term corresponds to the probability of an element y belonging to a class k.
This function is computed for every observation and each point is assigned to the highest value obtained by the function:
\[ \hat{y} = \arg \max_k \delta_k(x)\ \]
We will start creating the different models
# For reproducibility purposes
set.seed(123)
# Creating the model
qda_model1 <- qda(SalePrice_Category ~ ., data_enc)
print(qda_model1)
## Call:
## qda(SalePrice_Category ~ ., data = data_enc)
##
## Prior probabilities of groups:
## Very Low Low Medium High Very High
## 0.2019660 0.2028597 0.1957105 0.1992851 0.2001787
##
## Group means:
## GrLivArea OverallQual BsmtFinSF1 TotalBsmtSF YearBuilt
## Very Low -0.89897898 -0.8084948 -0.471356884 -0.8134139 -0.860202470
## Low -0.47388813 -0.5558272 -0.006713687 -0.2514008 -0.455073696
## Medium -0.08834126 -0.1836217 0.072804608 -0.1443070 0.009245684
## High 0.34481045 0.2305487 0.042278724 0.1978131 0.512387867
## Very High 1.01597042 1.2499267 0.503369343 0.9697682 0.870050513
## X1stFlrSF LotArea OverallCond GarageArea Fireplaces X2ndFlrSF
## Very Low -0.7260584 -0.5993614 -0.02764761 -0.81105648 -0.6795843 -0.36485195
## Low -0.2158060 -0.2084054 0.26845459 -0.39207412 -0.4226555 -0.31748596
## Medium -0.1629992 -0.1370641 0.08655623 -0.03914165 0.1377177 -0.07221845
## High 0.1378146 0.1573188 -0.16478885 0.22215067 0.1861903 0.17395589
## Very High 0.8989662 0.6582182 -0.24297170 1.02627918 0.7616358 0.47111071
## YearRemodAdd MSSubClass LotFrontage GarageCars Neighborhood
## Very Low -0.7237820 -0.056265673 -0.52577977 -0.9163443 -0.96358962
## Low -0.3415088 -0.012173245 -0.13263724 -0.4039114 -0.54833368
## Medium -0.0118478 0.177075402 -0.01255670 0.1969817 -0.07434321
## High 0.3776968 -0.002068079 0.04214177 0.3358237 0.50182262
## Very High 0.7839652 -0.179762739 0.60607385 0.8069376 1.10097195
## GarageType FireplaceQu BldgType ExterQual BsmtQual MSZoning
## Very Low -0.9785726 -0.66755421 -0.29778165 -0.7127374 -0.90052052 -0.7547616
## Low -0.3902583 -0.43346509 -0.18599517 -0.5536524 -0.45547169 -0.2039545
## Medium 0.1779047 0.06239203 0.04739346 -0.2847070 -0.02340936 0.1586161
## High 0.5343423 0.17086826 0.18969736 0.4524897 0.41719665 0.3690436
## Very High 0.6769044 0.88168014 0.25374053 1.1080508 0.97768531 0.4457149
## HouseStyle KitchenQual
## Very Low -0.4828788 -0.7502743
## Low -0.4286188 -0.5057292
## Medium 0.1608942 -0.1787950
## High 0.3357419 0.3777541
## Very High 0.4300035 1.0682119
We can see that the prior probabilities for each of the classes is around 0.2 for each of the classes, basically equal chances of being assigned to each of the classes. We already knew this as it was assigned an equal distribution to each group at the beginning. Moreover, we can see the mean value obtained for each of the variables with respect to the classes. Variables such as Area and Quality have a higher value for Very High sale price and small values for low prices which makes sense.
We will use cross validation and create a model based on the Quadratic Discriminant Analysis
# For reproducibility purposes
set.seed(123)
# Setting up cross-validation
train_control <- trainControl(method = "cv", number = 10)
# Training the QDA model with cross-validation
qda_model <- train(SalePrice_Category ~ ., data = data_enc, method = "qda", trControl = train_control)
# Displaying the results
print(qda_model)
## Quadratic Discriminant Analysis
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6453031 0.5565628
We can see that the value obtained for accuracy is around 0.64 and around 0.55 for the kappa value which are somewhat decent values and better than doing it by random chance but we could improve this values for sure.
We will now compare with LDA and then jump to conclusions.
We have already briefly introduced Linear Discriminant Analysis (LDA), it is also a supervised learning algorithm used for classification. It eliminates some of the QDA problems, it does this by introducing some bias to reduce variance thereby having a lower prediction error. The division of data is done linearly.
Same as QDA, it assumes a normal distribution but the main difference and the main reason why problems such as really high dimensionality are solved is that it assumes the covariance matrices are equal. This is because if there are K classes we would only need K-1 discriminant functions as one class could be inferred from others.
\[ \delta'_k(x) = \delta_k(x) - \delta_K(x) \]
Moreover it estimates p+1 parameters (transformation of the original parameters) to make the discriminant function. In Big O notation this is \(O(p)\). But QDA has to estimate the mean for each parameter \(p\), compute \(\dfrac{p(p+1)}{2}\) covariance matrices and the probabilities. Which is why for a large \(p\) LDA will be more practical. In Big O notation this is \(O(p^2)\).
For classifying it follows the same procedure as QDA of the discriminant function. The biggest difference is just that it computes only one matrix. Same as before it will classify according to the highest value obtained.
Creating our model and training
# For reproducibility purposes
set.seed(123)
# Creating the model
lda_model1 <- lda(SalePrice_Category~., data_enc)
print(lda_model1)
## Call:
## lda(SalePrice_Category ~ ., data = data_enc)
##
## Prior probabilities of groups:
## Very Low Low Medium High Very High
## 0.2019660 0.2028597 0.1957105 0.1992851 0.2001787
##
## Group means:
## GrLivArea OverallQual BsmtFinSF1 TotalBsmtSF YearBuilt
## Very Low -0.89897898 -0.8084948 -0.471356884 -0.8134139 -0.860202470
## Low -0.47388813 -0.5558272 -0.006713687 -0.2514008 -0.455073696
## Medium -0.08834126 -0.1836217 0.072804608 -0.1443070 0.009245684
## High 0.34481045 0.2305487 0.042278724 0.1978131 0.512387867
## Very High 1.01597042 1.2499267 0.503369343 0.9697682 0.870050513
## X1stFlrSF LotArea OverallCond GarageArea Fireplaces X2ndFlrSF
## Very Low -0.7260584 -0.5993614 -0.02764761 -0.81105648 -0.6795843 -0.36485195
## Low -0.2158060 -0.2084054 0.26845459 -0.39207412 -0.4226555 -0.31748596
## Medium -0.1629992 -0.1370641 0.08655623 -0.03914165 0.1377177 -0.07221845
## High 0.1378146 0.1573188 -0.16478885 0.22215067 0.1861903 0.17395589
## Very High 0.8989662 0.6582182 -0.24297170 1.02627918 0.7616358 0.47111071
## YearRemodAdd MSSubClass LotFrontage GarageCars Neighborhood
## Very Low -0.7237820 -0.056265673 -0.52577977 -0.9163443 -0.96358962
## Low -0.3415088 -0.012173245 -0.13263724 -0.4039114 -0.54833368
## Medium -0.0118478 0.177075402 -0.01255670 0.1969817 -0.07434321
## High 0.3776968 -0.002068079 0.04214177 0.3358237 0.50182262
## Very High 0.7839652 -0.179762739 0.60607385 0.8069376 1.10097195
## GarageType FireplaceQu BldgType ExterQual BsmtQual MSZoning
## Very Low -0.9785726 -0.66755421 -0.29778165 -0.7127374 -0.90052052 -0.7547616
## Low -0.3902583 -0.43346509 -0.18599517 -0.5536524 -0.45547169 -0.2039545
## Medium 0.1779047 0.06239203 0.04739346 -0.2847070 -0.02340936 0.1586161
## High 0.5343423 0.17086826 0.18969736 0.4524897 0.41719665 0.3690436
## Very High 0.6769044 0.88168014 0.25374053 1.1080508 0.97768531 0.4457149
## HouseStyle KitchenQual
## Very Low -0.4828788 -0.7502743
## Low -0.4286188 -0.5057292
## Medium 0.1608942 -0.1787950
## High 0.3357419 0.3777541
## Very High 0.4300035 1.0682119
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4
## GrLivArea 0.70005004 -1.6148596052 -1.04691655 0.114682480
## OverallQual 0.34262743 0.7396336259 0.52807595 0.762074949
## BsmtFinSF1 0.17688493 -0.0003622333 0.38311644 -0.033076731
## TotalBsmtSF 0.29027639 -0.0133412383 0.37072401 -0.292588865
## YearBuilt 0.44452315 -0.2389854244 -0.02347642 -0.563028020
## X1stFlrSF 0.08999768 1.0032064172 0.74099821 -0.513724435
## LotArea 0.25027980 0.1381511905 -0.08932354 -0.357553255
## OverallCond 0.26101322 -0.3504503147 0.39681261 -0.453144656
## GarageArea 0.25905610 0.5009420497 0.10305809 0.113956698
## Fireplaces 0.24521768 -0.5428960404 -0.43909857 0.603549536
## X2ndFlrSF 0.18246036 1.4700372419 0.81231589 -0.644040291
## YearRemodAdd 0.08963743 -0.1334493590 0.22443314 0.162427018
## MSSubClass 0.02171808 -0.3903826178 0.55103299 -0.072604293
## LotFrontage -0.04877953 -0.0519883874 0.42453547 0.202744728
## GarageCars -0.02124012 -0.6094304542 0.24038675 0.385007964
## Neighborhood 0.31394354 -0.0075408732 -0.30973387 -0.076260783
## GarageType 0.09207639 -0.3923155376 -0.25664662 -0.257512516
## FireplaceQu 0.01407830 0.5827060971 0.51200661 0.017980960
## BldgType 0.16674264 -0.2539222740 0.08483753 0.018528824
## ExterQual 0.06200978 0.4162309199 -0.66946635 -0.433066116
## BsmtQual 0.19392753 -0.2794168047 -0.11192692 0.005888135
## MSZoning 0.08170953 -0.4284535170 0.18493085 -0.047246484
## HouseStyle 0.05494709 -0.0874078919 -0.18729932 0.496323483
## KitchenQual 0.18826069 0.1598340732 -0.27736382 0.052499415
##
## Proportion of trace:
## LD1 LD2 LD3 LD4
## 0.9125 0.0635 0.0167 0.0073
We can see in the output that the prior probabilities for each of the classes is around 0.2 for all of them. Apart from that we can also see the mean values for each class in the target variable. There are many negative values due to the normalization which has been done before. We can see that for example for the Very High Category, variables such as the area or the quality have high positive values whereas for the very low category this are negative.
We can also see the coefficients of linear discriminant which could be used to obtain the discriminant function or for dimensionality reduction as LDA is also used in this area. Based on this output we do not see much difference to the QDA done before, they provide really similar values.
Now we will use cross validation to obtain the accuracy and kappa.
# For reproducibility purposes
set.seed(123)
# Set up cross-validation
train_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Train the QDA model with cross-validation
lda_model <- train(SalePrice_Category ~ ., data = data_enc, method = "lda", trControl = train_control)
# Display the results
print(lda_model)
## Linear Discriminant Analysis
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results:
##
## Accuracy Kappa
## 0.696239 0.620249
The accuracy is of around 0.69 and a value of ~0.62 for kappa which is an increase with respect to the previous result we obtained
The results obtained are surprising if we take Box’s M test into consideration which returned a really low p value which indicated with great confidence that the covariance matrices were equal and this led us to believe that the results for QDA would be better.
After training both models we saw that this in fact was not true as LDA returns a better result. This could be due to an error in the Box M test which may make errors, there could also be potential overfitting from QDA as it estimates a separate covariance matrix and it might fit noise in the data.
Another possible option is that QDA is not regularized which is usually when the number of features is really high in relation to the number observations. Regular Discriminant Analysis offers a solution to this by reducing complexity and avoiding overfitting.
This also support our idea of training the different models as we could not completely guarantee that QDA would perform better.
Regularized Discriminant Analysis (RDA) is another version which could be used both for QDA and LDA. Its main idea is that it includes a regularization term to prevent overfitting by shrinking the covariance matrices, if elements from different classes overlap changing this term will improve accuracy, this parameter is gamma and it helps shrink towards a more stable value. If its value is 0, no regularization is done. Its formula is given by:
\[ \Sigma_k^\text{reg} = (1 - \gamma) \Sigma_k + \gamma \Sigma_{\text{global}} \]
The first term is the regularization with gamma times the covariance matrix for each class \(k\) and the regularization matrix times the pooled matrix (average of all the matrices). Apart from that, we can also regularize the mean vector with \(\lambda\), it applies a penalty to mean vectors towards 0 or a common mean. A value of 0 indicates no regularization. Its formula is given by:
\[ \mu_k^\gamma = (1 - \gamma) \hat{\mu}_k + \gamma \bar{x} \]
Regularization times the mean vector for each class added to lambda times the mean vector of all the classes. All this results in the final formula given by:
\[ \delta_k(x) = \ln \pi_k - \frac{1}{2} \ln |\Sigma_k^\lambda| - \frac{1}{2} (x - \mu_k^\gamma)^T \Sigma_k^{\lambda^{-1}} (x - \mu_k^\gamma) \]
Each point will be assigned according to the discriminant function which returns a higher value.
In this code we could try many values for gamma and lambda, the complexity for this is \(O(n^n)\)as each value for lambda is tested with each value for gamma, therefore we chose a few numbers as it returns good values and it will not take as much time.
# For reproducibility purposes
set.seed(123)
# Setting up cross-validation with 10 folds
train_control <- trainControl(method = "cv", number = 10)
# Defining a tuning grid for both gamma and lambda
tune_grid <- expand.grid(gamma = seq(0, 1, by = 0.3),
lambda = seq(0, 1, by = 0.3))
# Training the regularized QDA model with cross-validation and tuning
rda_model <- train(SalePrice_Category ~ ., data = data_enc, method = "rda",
trControl = train_control, tuneGrid = tune_grid)
# Displaying the results
print(rda_model$results)
## gamma lambda Accuracy Kappa AccuracySD KappaSD
## 1 0.0 0.0 0.6453031 0.5565628 0.03951945 0.04933169
## 2 0.0 0.3 0.6729835 0.5911945 0.03593218 0.04482525
## 3 0.0 0.6 0.6890880 0.6113408 0.03747160 0.04673346
## 4 0.0 0.9 0.6917904 0.6147080 0.04183405 0.05221642
## 5 0.3 0.0 0.6639982 0.5799314 0.04512008 0.05634163
## 6 0.3 0.3 0.6748252 0.5934731 0.04658444 0.05816070
## 7 0.3 0.6 0.6900366 0.6125032 0.05548687 0.06929057
## 8 0.3 0.9 0.6900285 0.6125169 0.04141045 0.05166851
## 9 0.6 0.0 0.6631536 0.5788941 0.05765142 0.07201830
## 10 0.6 0.3 0.6774961 0.5968412 0.05022791 0.06271408
## 11 0.6 0.6 0.6837383 0.6046269 0.04707282 0.05879922
## 12 0.6 0.9 0.6819924 0.6024891 0.04456779 0.05560575
## 13 0.9 0.0 0.6667490 0.5833722 0.04552944 0.05692334
## 14 0.9 0.3 0.6712537 0.5890222 0.05189208 0.06483217
## 15 0.9 0.6 0.6686306 0.5857660 0.05595618 0.06988890
## 16 0.9 0.9 0.6668525 0.5835528 0.05037612 0.06289765
# Obtaining the best parameters
best_result <- rda_model$results[which.max(rda_model$results$Accuracy), ]
# Returning the results
cat("Best Accuracy: ", best_result$Accuracy,"\n")
## Best Accuracy: 0.6917904
cat("Best Gamma: ", best_result$gamma,"\n")
## Best Gamma: 0
cat("Best Lambda: ", best_result$lambda,"\n")
## Best Lambda: 0.9
We can see that we obtained an accuracy of around 0.7 which is a clear improvement with respect to the previous QDA we performed and it obtains a greater accuracy than LDA. This supports the initial conclusion from the Box M test which stated different covariance matrices.
We will now make some boxplots to create a visual representation of the results obtained and compare between the different trained models.
# Obtaining the performance metrics for all the different models
rda_res <- rda_model$resample
lda_res <- lda_model$resample
qda_res <- qda_model$resample
# Combining the results into one data frame
resample_data <- rbind(
data.frame(Model = "RDA", Accuracy = rda_res$Accuracy),
data.frame(Model = "LDA", Accuracy = lda_res$Accuracy),
data.frame(Model = "QDA", Accuracy = qda_res$Accuracy)
)
# Plotting accuracy across folds
ggplot(resample_data, aes(x = Model, y = Accuracy, color = Model)) +
geom_boxplot() +
labs(title = "Cross-Validation Accuracy Comparison", x = "Model", y = "Accuracy") +
theme_minimal()
The boxplots show that clearly the QDA has a wider spread and a lower median accuracy, LDA does seem to work well and its values are not bad although the median is below 0.7. RDA seems to have a bit better median and the results are clearly as not as spread as the others. Its mean accuracy is around 0.7 which are values not usually obtained by both QDA and LDA.
Following the same procedure as KNN we will use PCA to visualize the predictions.
pca_data$Predicted_Class_rda <- predict(rda_model, newdata = data_enc)
# Visualize the PCA result with predicted class labels
ggplot(pca_data, aes(x = PC1, y = PC2, color = Predicted_Class_rda, shape=SalePrice_Category)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "RDA Predictions in 2D (PCA)", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(pca_data$Predicted_Class_rda)))) # Customize colors for predicted classes
The results obtained are really similar to those of the KNN, clear distinction for Very High and Very Low with some errors. There are more problems for the classification of the middle classes.
# Defining a prediction wrapper for probabilities
predict_function <- function(model, newdata) {
predict(model, newdata = newdata, type = "prob")
}
# Creating the Predictor object
predictor_rda <- Predictor$new(
model = rda_model,
data = data_enc[, -which(names(data_enc) == "SalePrice_Category")],
y = data_enc$SalePrice_Category,
predict.fun = predict_function
)
# Computing SHAP values for the first observation
shap_rda <- Shapley$new(predictor_rda, x.interest = data_enc[1, -which(names(data_enc) == "SalePrice_Category")])
Plotting the results.
# Obtaining the results
shap_values_rda <- shap_rda$results
long_shap_rda <- melt(shap_values_rda)
## Using feature, class, feature.value as id variables
# Sorting by importance
long_shap_rda <- long_shap_rda %>%
mutate(feature = reorder(feature, abs(value), FUN = mean))
# Creating the violin plots
ggplot(long_shap_rda, aes(x = value, y = feature)) +
geom_violin(fill = "lightblue", scale = "width") +
ggtitle("SHAP Values for Regularized Discriminant Analysis") +
xlab("SHAP value (impact on model output)") +
ylab("Feature") +
theme_minimal()
The plot shows how each feature contributes to the model output. We see how there are many variables that contribute very little to the prediction such as BldgType. We also observe that the variables that seem to drive mostly the predictions ar GrLiveArea, X2ndFlrSF and ExterQual. This is surprising since we did not see the area of the 2nd floor before (X2ndFlrSF), but the model does a pretty good job taking it into account.
Naive Bayes is a conditional probability model based on the Bayes theorem. It assigns a probability that an observation \(\mathbf{x}\) belongs to each possible class \(C_k\). Mathematically:
\[ p(C_k\mid\mathbf{x})=\dfrac{p(C_k)p(\mathbf{x}\mid C_k)}{p(\mathbf{x})} \]
Where \(k\) is the number possible classes, and the vector \(\mathbf{x}\) has \(n\) features. In the end, the classifier decides the class based on the one that has the higher posterior probability. We then end up with the following prediction \(\hat{y}\) :
\[ \hat{y} = \underset{k \in \{1, \dots, K\}}{\operatorname{argmax}} \ p(C_k) \prod_{i=1}^n p(x_i \mid C_k). \]
Since we are mostly dealing with continuous data, and after the scaling and transformation, we will use Gaussian Naive Bayes. For this model we assume each class follows a normal distribution. Then, suppose observation value \(v\), the probability density of it being in \(C_k\) is computed as follows (based on the normal distribution):
\[ p(x = v \mid C_k) = \frac{1}{\sqrt{2\pi \sigma_k^2}} \, e^{-\frac{(v - \mu_k)^2}{2\sigma_k^2}} \]
Where \(\mu_k\) and \(\sigma_k^2\) represent the mean and variance of the feature values in the class \(C_k\).
Although it is a somewhat simple kind of model, it is good to try it before moving to more fancy methods. Furthermore, since the outputs are probabilities we can obtain valuable interpretations from it.
We will firstly perform a small hyper-parameter tuning to optimize the model’s results, while measuring how well it performs with a 10 fold cross-validation.
# For reproducibility purposes
set.seed(123)
# Define the trainControl object for cross-validation
train_control <- trainControl(method = "cv", number = 10)
# Train Naive Bayes with caret
suppressWarnings({
nb_model <- train(SalePrice_Category ~ ., data = data_enc, method = "nb", trControl = train_control, tuneGrid = expand.grid(fL = c(0, 0.5, 1), usekernel = c(TRUE, FALSE), adjust = c(1, 1.5)))
})
# View the model
print(nb_model)
## Naive Bayes
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results across tuning parameters:
##
## fL usekernel adjust Accuracy Kappa
## 0.0 FALSE 1.0 0.6605625 0.5755632
## 0.0 FALSE 1.5 0.6605625 0.5755632
## 0.0 TRUE 1.0 0.6478778 0.5597317
## 0.0 TRUE 1.5 0.6354087 0.5440482
## 0.5 FALSE 1.0 0.6605625 0.5755632
## 0.5 FALSE 1.5 0.6605625 0.5755632
## 0.5 TRUE 1.0 0.6478778 0.5597317
## 0.5 TRUE 1.5 0.6354087 0.5440482
## 1.0 FALSE 1.0 0.6605625 0.5755632
## 1.0 FALSE 1.5 0.6605625 0.5755632
## 1.0 TRUE 1.0 0.6478778 0.5597317
## 1.0 TRUE 1.5 0.6354087 0.5440482
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
## = 1.
We observe that the best model has a ~66% accuracy and a Kappa of ~0.57 which has been very common in different algorithms throughout this project. This can be due to many reasons, but again there are inherent problems with the model and our data.
Firstly, each class is assumed to be normally distributed, and although we have scaled and transformed the data, we could observe how the middle class is normally distributed while maybe the “Very High” class is skewed. Moreover, we also assume independence between each class which is not correct, since for example the overall quality (OverallQual) and exterior quality (ExterQual) are clearly related, we observed this in the correlation plots we did right at the start.
Now we will move to the interpretation of the predictions. Here we print the mean and standard deviation for each class and feature. We would expect an increase or decrease in the mean as the price gets higher, while the standard deviation stays the same.
# Train model with klaR to extract probabilities
nb_model_klar <- NaiveBayes(SalePrice_Category ~ ., data = data_enc)
# View conditional probabilities for features
nb_model_klar$tables
## $GrLivArea
## [,1] [,2]
## Very Low -0.89897898 0.8046418
## Low -0.47388813 0.7870503
## Medium -0.08834126 0.6061179
## High 0.34481045 0.5471219
## Very High 1.01597042 0.6267185
##
## $OverallQual
## [,1] [,2]
## Very Low -0.8084948 0.4590541
## Low -0.5558272 0.4035994
## Medium -0.1836217 0.4853921
## High 0.2305487 0.4992414
## Very High 1.2499267 0.9506974
##
## $BsmtFinSF1
## [,1] [,2]
## Very Low -0.471356884 0.6219133
## Low -0.006713687 0.7181262
## Medium 0.072804608 0.7621589
## High 0.042278724 0.8715712
## Very High 0.503369343 1.1365844
##
## $TotalBsmtSF
## [,1] [,2]
## Very Low -0.8134139 0.8010905
## Low -0.2514008 0.7466992
## Medium -0.1443070 0.7836814
## High 0.1978131 0.8295028
## Very High 0.9697682 0.7510355
##
## $YearBuilt
## [,1] [,2]
## Very Low -0.860202470 0.7061909
## Low -0.455073696 0.7115357
## Medium 0.009245684 0.8382566
## High 0.512387867 0.7698071
## Very High 0.870050513 0.7622860
##
## $X1stFlrSF
## [,1] [,2]
## Very Low -0.7260584 0.8126092
## Low -0.2158060 0.7180302
## Medium -0.1629992 0.9040488
## High 0.1378146 0.8691674
## Very High 0.8989662 0.7585104
##
## $LotArea
## [,1] [,2]
## Very Low -0.5993614 0.8806203
## Low -0.2084054 0.8230830
## Medium -0.1370641 1.0010815
## High 0.1573188 0.9152732
## Very High 0.6582182 0.7479319
##
## $OverallCond
## [,1] [,2]
## Very Low -0.02764761 1.0366378
## Low 0.26845459 1.0572647
## Medium 0.08655623 0.9996954
## High -0.16478885 0.8173155
## Very High -0.24297170 0.6912844
##
## $GarageArea
## [,1] [,2]
## Very Low -0.81105648 0.8524017
## Low -0.39207412 0.8002536
## Medium -0.03914165 0.6700815
## High 0.22215067 0.6025207
## Very High 1.02627918 0.7121573
##
## $Fireplaces
## [,1] [,2]
## Very Low -0.6795843 0.7388621
## Low -0.4226555 0.9167488
## Medium 0.1377177 0.9918790
## High 0.1861903 0.9741225
## Very High 0.7616358 0.6168348
##
## $X2ndFlrSF
## [,1] [,2]
## Very Low -0.36485195 0.5805863
## Low -0.31748596 0.6395696
## Medium -0.07221845 0.7558173
## High 0.17395589 0.9846102
## Very High 0.47111071 1.3107925
##
## $YearRemodAdd
## [,1] [,2]
## Very Low -0.7237820 0.8616076
## Low -0.3415088 0.8513848
## Medium -0.0118478 0.8004722
## High 0.3776968 0.7096437
## Very High 0.7839652 0.7580244
##
## $MSSubClass
## [,1] [,2]
## Very Low -0.056265673 1.0708712
## Low -0.012173245 1.0664526
## Medium 0.177075402 1.1017917
## High -0.002068079 0.8644107
## Very High -0.179762739 0.7967863
##
## $LotFrontage
## [,1] [,2]
## Very Low -0.52577977 0.9290979
## Low -0.13263724 0.8164691
## Medium -0.01255670 1.0114366
## High 0.04214177 0.8503142
## Very High 0.60607385 0.9598913
##
## $GarageCars
## [,1] [,2]
## Very Low -0.9163443 1.3320850
## Low -0.4039114 0.9085143
## Medium 0.1969817 0.4718565
## High 0.3358237 0.3916248
## Very High 0.8069376 0.4407288
##
## $Neighborhood
## [,1] [,2]
## Very Low -0.96358962 0.5751379
## Low -0.54833368 0.6409721
## Medium -0.07434321 0.7710212
## High 0.50182262 0.6455013
## Very High 1.10097195 0.7385760
##
## $GarageType
## [,1] [,2]
## Very Low -0.9785726 0.9477950
## Low -0.3902583 0.9809476
## Medium 0.1779047 0.8299203
## High 0.5343423 0.5925598
## Very High 0.6769044 0.4545569
##
## $FireplaceQu
## [,1] [,2]
## Very Low -0.66755421 0.7397864
## Low -0.43346509 0.8951963
## Medium 0.06239203 0.9371531
## High 0.17086826 0.9465590
## Very High 0.88168014 0.6647050
##
## $BldgType
## [,1] [,2]
## Very Low -0.29778165 1.3623472
## Low -0.18599517 1.3159675
## Medium 0.04739346 0.8917715
## High 0.18969736 0.5714694
## Very High 0.25374053 0.2350869
##
## $ExterQual
## [,1] [,2]
## Very Low -0.7127374 0.4499156
## Low -0.5536524 0.4887370
## Medium -0.2847070 0.7787261
## High 0.4524897 0.9030459
## Very High 1.1080508 0.9044292
##
## $BsmtQual
## [,1] [,2]
## Very Low -0.90052052 0.9309911
## Low -0.45547169 0.7272669
## Medium -0.02340936 0.7664980
## High 0.41719665 0.6102603
## Very High 0.97768531 0.7000287
##
## $MSZoning
## [,1] [,2]
## Very Low -0.7547616 1.3717813
## Low -0.2039545 1.0468415
## Medium 0.1586161 0.8656508
## High 0.3690436 0.4758747
## Very High 0.4457149 0.2570120
##
## $HouseStyle
## [,1] [,2]
## Very Low -0.4828788 1.1263708
## Low -0.4286188 0.9599538
## Medium 0.1608942 0.8779752
## High 0.3357419 0.8486101
## Very High 0.4300035 0.7627531
##
## $KitchenQual
## [,1] [,2]
## Very Low -0.7502743 0.6816811
## Low -0.5057292 0.7325214
## Medium -0.1787950 0.8228022
## High 0.3777541 0.7168954
## Very High 1.0682119 0.8223207
Although our initial guess might be correct at first we observe different cases where the means do not follow the suggested order, like in MSSubClass or OverallCond. Furthermore we see that there is a very noticeable gap in means between Very Low, Very High and the rest, while the middle 3 classes sometimes do not follow the order we discussed before, and they have really similar mean. This combined with the standard deviations, which are moreless the same for each class, makes the 3 middle classes very likely to be predicted wrong, just as we saw with each previous model.
We will finally do a plot with PCA to compare the predicted class with the real one. We will also highlight the points where the posterior probability for the predicted class has been less than 0.6, since they should be uncertain points for the model, which could be between classes.
# Predict on the training dataset
suppressWarnings({
predictions <- predict(nb_model, data_enc, type = "prob")
})
# Identify uncertain points
uncertainty_threshold <- 0.6
pca_data$Uncertain <- apply(predictions, 1, max) < uncertainty_threshold
# Step 2: Add the predicted class labels to the PCA data
suppressWarnings({
pca_data$Predicted_Class_nb <- predict(nb_model, newdata = data_enc)
})
# Step 3: Update the ggplot visualization to highlight uncertain points
ggplot(pca_data, aes(x = PC1, y = PC2, color = Predicted_Class_nb, shape = SalePrice_Category)) +
geom_point(aes(size = !Uncertain), alpha = 0.7) +
scale_size_manual(values = c(`TRUE` = 1, `FALSE` = 5), guide = "none") +
labs(title = "Naive Bayes Predictions in 2D (PCA)", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(pca_data$Predicted_Class_nb))))
We see in the plot that many of the “uncertain” points are in the boundary between classes. We also observe that many of these points are all over the middle, since as we have said all throughout the project, it is hard to classify the points in the middle, since a point with High price could be close to one with Low price. Furthermore, many of the points that are uncertain are wrongly classified.
Now we will calculate the SHAP values to further interpret the classification.
# Define a prediction wrapper for probabilities (if required)
predict_function <- function(model, newdata) {
predict(model, newdata = newdata, type = "prob")
}
suppressWarnings({
# Create the Predictor object
predictor_nb <- Predictor$new(
model = nb_model,
data = data_enc[, -which(names(data_enc) == "SalePrice_Category")],
y = data_enc$SalePrice_Category,
predict.fun = predict_function
)
# Computing SHAP values for the first Instance
shap_nb <- Shapley$new(predictor_nb, x.interest = data_enc[1, -which(names(data_enc) == "SalePrice_Category")]) })
Plotting the results.
# Obtaining the results
shap_values_nb <- shap_rda$results
long_shap_nb <- melt(shap_values_nb)
## Using feature, class, feature.value as id variables
# Sorting by importance
long_shap_nb <- long_shap_nb %>%
mutate(feature = reorder(feature, abs(value), FUN = mean))
# Creating the violin plots
ggplot(long_shap_nb, aes(x = value, y = feature)) +
geom_violin(fill = "lightblue", scale = "width") +
ggtitle("SHAP Values for Naive Bayes") +
xlab("SHAP value (impact on model output)") +
ylab("Feature") +
theme_minimal()
We observe yet again how variables related to area and quality are the ones that drive the most the predictions. This has been a common trend across models, so the difference in accuracy should be in finer details.
The next algorithm we will use is Logistic Regression. Logistic regression can be used for regression and classification, and there are different types of Logistic Regression. For classification, these are Binary, Multinomial and Ordinal. Binary Logistic Regression has only two possible outcomes, Multinomial has more than two possible outcomes and Ordinal has more than two possible outcomes but with the difference that the possible outcomes have an order.
In this project, we have divided the Price into Very Low, Low, Medium, High, Very High. Clearly, there is an ordering of the classes and there are more than two classes, therefore the best logistic model will be Ordinal. Ordinal Logistic Regression’s idea is based on the previous models which we will briefly explain to obtain a general view and then focus on our model.
Logistic regression maps the different futures to probabilities using a sigmoid function to make sure the output is between 0 and 1. It calculates the sigmoid function with a combination of the different variables, which is similar to the regression model. This formula is given by:
\[ \pi(X) = P(Y = 1 \mid X) = \frac{1}{1 + e^{-z}} \] Where: \[ z = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \dots + \beta_n X_n \]
After having all values between 0 and 1, we will use this probability to obtain the odds, which are the relation between the probability of success and failure.
\[ % Sigmoid Function\[\pi(X) = P(Y = 1 \mid X) = \frac{1}{1 + e^{-z}}\] Where: \[z = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \dots + \beta_n X_n\] \]\[\text{Odds} = \frac{\pi(X)}{1 - \pi(X)}\]
But we want the relationship to be linear combination of the variables, based on the previous formula, if we apply the logarithm to get rid of the exponential we will obtain what we want \[\text{Logit}(\pi(X)) = \log\left( \frac{\pi(X)}{1 - \pi(X)} \right) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \dots + \beta_n X_n\]
Multinomial Logistic Regression uses the Softmax Function which is a generalization of the logistic function in more dimensions. It takes a vector of x numbers and normalizes it into a probability distribution. The output will return components in the interval (0,1) where the components add up to 1. Given by the formula:
\[ \pi_k(X) = \frac{\exp(\beta_{0,k} + \beta_1^T X)}{1 + \sum_{j=1}^{G-1} \exp(\beta_{0,j} + \beta_j^T X)} \]
The groups, \(G\) , in Multinomial Linear Regression go from 0 to \(G-1\) where the first group is the reference group \(y\) = 0, this could also be the last. The reference class does not have its own betas, it is represented in the denominator. The coefficients for the other groups show how the different variables are related to being in that outcome group versus the reference group. The choosing of this class is important as it affects the coefficients.
Ordinal Logistic Regression has the same groups but they are ordered. We have the function which is a cumulative probability model which measures the probability of being in a category or higher based on the predictors \(X\). The main difference is that its probabilities are derived from cumulative probabilities as each category has a threshold whereas Multinomial calculated them independently. The formula is given by
\[ \sum_{g=1}^{G^*} \rho_g = \frac{\exp(\beta_{0,g} + \beta^T X)}{1 + \exp(\beta_{0,g} + \beta^T X)} \]
As explained before we have more than one class, therefore we will not be able to use binary logistic regression. The different groups here show a clear order. But on this project we want to try many different algorithms to understand and interpret them, therefore we will train both Multinomial and Ordinal Logistic regression.
# For reproducibility purposes
set.seed(123)
# Creating cross validation
train_control <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
# Training the multinomial model with cross-validation
multinomial_model <- train(SalePrice_Category ~ ., data = data_not_enc, method = "multinom", trControl = train_control, trace = FALSE)
# Trace is added to avoid unnecesary output
# Display the results
print(multinomial_model)
## Penalized Multinomial Regression
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results across tuning parameters:
##
## decay Accuracy Kappa
## 0e+00 0.6818888 0.6023133
## 1e-04 0.6809959 0.6011975
## 1e-01 0.6998588 0.6247975
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 0.1.
In this output we see that the multinomial regression is regularized and it uses a decay to obtain better values. Same as seen on QDA, the regularization parameter prevents overfitting and fixes complexity. The accuracy obtained was of ~0.69 and a kappa of ~0.62 which is a good value and it is slightly better than the models we have seen until now.
We will know use Ordinal Logistic Regression, we will focus more on this model as at first hand it will return the best values
# For reproducibility purposes
set.seed(123)
ordinal_model1 <- polr(SalePrice_Category ~ ., data = data_not_enc, Hess = TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## view a summary of the model
summary(ordinal_model1)
## Call:
## polr(formula = SalePrice_Category ~ ., data = data_not_enc, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## GrLivArea 2.395068 4.781e-01 5.009e+00
## OverallQual 1.642622 2.002e-01 8.207e+00
## BsmtFinSF1 0.810764 1.197e-01 6.773e+00
## TotalBsmtSF 0.842403 2.092e-01 4.026e+00
## YearBuilt 1.430122 2.265e-01 6.314e+00
## X1stFlrSF -0.274057 4.029e-01 -6.802e-01
## LotArea 0.616475 1.269e-01 4.858e+00
## OverallCond 0.817645 1.151e-01 7.103e+00
## GarageArea 0.512972 1.830e-01 2.803e+00
## Fireplaces 0.435116 4.148e-01 1.049e+00
## X2ndFlrSF 0.541715 3.564e-01 1.520e+00
## YearRemodAdd 0.285188 1.428e-01 1.997e+00
## MSSubClass -0.291968 3.485e-01 -8.379e-01
## LotFrontage -0.072903 1.104e-01 -6.603e-01
## GarageCars 0.471435 2.683e-01 1.757e+00
## NeighborhoodBlueste -1.009431 1.883e+00 -5.362e-01
## NeighborhoodBrDale -1.673862 1.560e+00 -1.073e+00
## NeighborhoodBrkSide 2.347143 1.039e+00 2.260e+00
## NeighborhoodClearCr 2.366162 9.948e-01 2.378e+00
## NeighborhoodCollgCr 1.082712 8.022e-01 1.350e+00
## NeighborhoodCrawfor 2.739972 9.697e-01 2.826e+00
## NeighborhoodEdwards -0.018495 8.929e-01 -2.071e-02
## NeighborhoodGilbert 0.941771 8.426e-01 1.118e+00
## NeighborhoodIDOTRR 2.259488 1.218e+00 1.855e+00
## NeighborhoodMeadowV -1.941708 1.283e+00 -1.514e+00
## NeighborhoodMitchel -0.198176 8.913e-01 -2.223e-01
## NeighborhoodNAmes 0.182452 8.520e-01 2.142e-01
## NeighborhoodNoRidge 2.572254 1.241e+00 2.073e+00
## NeighborhoodNPkVill -0.336591 1.189e+00 -2.831e-01
## NeighborhoodNridgHt 0.886207 9.294e-01 9.535e-01
## NeighborhoodNWAmes -0.004995 8.651e-01 -5.774e-03
## NeighborhoodOldTown -0.120744 1.109e+00 -1.088e-01
## NeighborhoodSawyer 0.505766 8.936e-01 5.660e-01
## NeighborhoodSawyerW 0.820401 8.456e-01 9.702e-01
## NeighborhoodSomerst 0.965521 1.151e+00 8.386e-01
## NeighborhoodStoneBr 1.424224 1.123e+00 1.268e+00
## NeighborhoodSWISU 1.311093 1.106e+00 1.185e+00
## NeighborhoodTimber -0.094028 9.494e-01 -9.904e-02
## NeighborhoodVeenker 1.091653 1.090e+00 1.002e+00
## GarageTypeAttchd 0.545742 1.042e+00 5.236e-01
## GarageTypeBasment -1.134900 1.276e+00 -8.896e-01
## GarageTypeBuiltIn 0.303708 1.147e+00 2.649e-01
## GarageTypeCarPort -2.812819 2.832e+00 -9.931e-01
## GarageTypeDetchd -0.068277 1.042e+00 -6.551e-02
## GarageTypeNone 0.777083 1.363e+00 5.702e-01
## FireplaceQuFa -12.838679 4.663e+00 -2.753e+00
## FireplaceQuGd -12.821177 4.651e+00 -2.757e+00
## FireplaceQuNone -13.029531 4.685e+00 -2.781e+00
## FireplaceQuPo -13.731948 4.674e+00 -2.938e+00
## FireplaceQuTA -13.022358 4.650e+00 -2.801e+00
## BldgType2fmCon 0.232026 1.122e+00 2.068e-01
## BldgTypeDuplex -1.943054 7.295e-01 -2.664e+00
## BldgTypeTwnhs 0.366948 1.001e+00 3.667e-01
## BldgTypeTwnhsE 1.157576 9.290e-01 1.246e+00
## ExterQualFa -26.099074 2.456e-09 -1.062e+10
## ExterQualGd 0.728635 1.201e+00 6.067e-01
## ExterQualTA 0.871581 1.190e+00 7.326e-01
## BsmtQualFa -1.211386 8.806e-01 -1.376e+00
## BsmtQualGd -1.288800 5.787e-01 -2.227e+00
## BsmtQualNone -2.957921 1.156e+00 -2.559e+00
## BsmtQualTA -1.747012 6.239e-01 -2.800e+00
## MSZoningFV 5.896621 2.779e+01 2.122e-01
## MSZoningRH 5.650901 2.779e+01 2.034e-01
## MSZoningRL 5.686717 2.778e+01 2.047e-01
## MSZoningRM 4.697305 2.777e+01 1.691e-01
## HouseStyle1.5Unf -0.252072 1.044e+00 -2.414e-01
## HouseStyle1Story 0.856668 5.920e-01 1.447e+00
## HouseStyle2.5Fin 0.566512 1.555e+00 3.644e-01
## HouseStyle2.5Unf 0.802625 9.114e-01 8.806e-01
## HouseStyle2Story 0.227135 4.000e-01 5.678e-01
## HouseStyleSFoyer 1.966765 7.951e-01 2.474e+00
## HouseStyleSLvl 1.631209 6.533e-01 2.497e+00
## KitchenQualFa -2.783300 8.859e-01 -3.142e+00
## KitchenQualGd -1.730244 6.662e-01 -2.597e+00
## KitchenQualTA -2.066498 6.803e-01 -3.038e+00
##
## Intercepts:
## Value Std. Error t value
## Very Low|Low -1.459390e+01 2.320240e+01 -6.290000e-01
## Low|Medium -1.064280e+01 2.320240e+01 -4.587000e-01
## Medium|High -6.808000e+00 2.320110e+01 -2.934000e-01
## High|Very High -2.130500e+00 2.320170e+01 -9.180000e-02
##
## Residual Deviance: 1283.694
## AIC: 1441.694
In this first output we obtain the coefficients with the standard error and t statistic. This value is also done on the different Sale Prices Categories, it compares the log odds of being in a category respect to the reference category. For example, High/Very High is the odds of being in the High category compared to being in the Very High category.
Residual variance is a measure of how well the data is fitted by the model, we want this number to be as low as possible. This number may be useful in the future if we compare the residual variance to other models in order to determine the best model. The AIC is a measure of the quality of the model, the lower the better.
Making the model with cross validation, it already applies regularization.
# For reproducibility purposes
set.seed(123)
# Creating cross validation
train_control <- trainControl(
method = "cv",
number = 5
)
# Training the multinomial model with cross-validation
suppressWarnings({
ordinal_model <- train(SalePrice_Category ~ ., data = data_enc, method = "polr", trControl = train_control)
})
# Display the results
print(ordinal_model)
## Ordered Logistic or Probit Regression
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 894, 895, 896, 896, 895
## Resampling results across tuning parameters:
##
## method Accuracy Kappa
## cauchit 0.7405109 0.6756398
## cloglog 0.7309417 0.6637684
## logistic 0.7444046 0.6805022
## loglog 0.6988164 0.6234341
## probit 0.7381385 0.6726532
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was method = logistic.
# summary(ordinal_model)
We see that there are 5 different link models which use different methods and functions to determine the cumulative properties of the response variables. Each of these methods behave better depending on the distribution of data, in our case we expect Probit or Logit to be the best method as it assumes a normal distribution. However we see that all methods return really similar values which means they perform really well.
As expected, we obtained a higher value than Multinomial regression, we obtained an accuracy of around 0.72 and a kappa of ~0.66 which shows improvement from previous models.
Following the same procedure as with other algorithms, we will now use PCA to obtain a 2D plot of the classification made by the algorithm.
pca_data$Predicted_Class_olr <- predict(ordinal_model, newdata = data_enc)
# Visualize the PCA result with predicted class labels
ggplot(pca_data, aes(x = PC1, y = PC2, color = Predicted_Class_olr, shape=SalePrice_Category)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "Ordinal Logistic Regression Predictions in 2D (PCA)", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(pca_data$Predicted_Class_olr)))) # Customize colors for predicted classes
We will now plot the ROC curve, we have a problem as our we have multivariable classification. In binary classification there is one class(positive) and another (negative), therefore to plot the ROC curve we will use One-vs-Rest approach where for each class, one of the classes is considered as the positive class and the remaning classes are considered the negative class. Therefore we will have 5 different roc curves, the ROC measures the True Positive Rate of the model (proportion of positive values actually classified) and False Positive Rate (proportion of misclassified negatives)
We calculate this ROC curve for the model.
# For reproducibility purposes
set.seed(123)
# Get predicted probabilities
pred_probs_ord <- predict(ordinal_model, newdata = data_enc, type = "prob")
# True labels (SalePrice_Category)
true_labels <- data_enc$SalePrice_Category
# Create a list to store ROC curves for each class (One-vs-Rest)
roc_list_ord <- list()
# Get the levels of the true labels (class names)
class_levels <- levels(true_labels)
# Calculate ROC curve for each class using One-vs-Rest approach
for (i in 1:length(class_levels)) {
# Treat the current class as positive (1) and all others as negative (0)
binary_true_labels <- ifelse(true_labels == class_levels[i], 1, 0)
# Calculate ROC curve for the current class
roc_list_ord[[i]] <- roc(binary_true_labels, pred_probs_ord[, i], quiet = TRUE)
}
# Plot ROC curves using ggroc
roc_plot_ord <- ggroc(roc_list_ord)
print(roc_list_ord)
## [[1]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = pred_probs_ord[, i], quiet = TRUE)
##
## Data: pred_probs_ord[, i] in 893 controls (binary_true_labels 0) < 226 cases (binary_true_labels 1).
## Area under the curve: 0.9681
##
## [[2]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = pred_probs_ord[, i], quiet = TRUE)
##
## Data: pred_probs_ord[, i] in 892 controls (binary_true_labels 0) < 227 cases (binary_true_labels 1).
## Area under the curve: 0.9058
##
## [[3]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = pred_probs_ord[, i], quiet = TRUE)
##
## Data: pred_probs_ord[, i] in 900 controls (binary_true_labels 0) < 219 cases (binary_true_labels 1).
## Area under the curve: 0.9049
##
## [[4]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = pred_probs_ord[, i], quiet = TRUE)
##
## Data: pred_probs_ord[, i] in 896 controls (binary_true_labels 0) < 223 cases (binary_true_labels 1).
## Area under the curve: 0.939
##
## [[5]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = pred_probs_ord[, i], quiet = TRUE)
##
## Data: pred_probs_ord[, i] in 895 controls (binary_true_labels 0) < 224 cases (binary_true_labels 1).
## Area under the curve: 0.9906
print(roc_plot_ord)
The name labels correspond to each of the classes Very Low, Low, Medium, High, Very High ordered from 1 to 5. At 1, the positive class is considered to be Very Low and the rest of classes are considered to be negative, and so on.
We also have to take into account that the ROC values are made with respect to the training data therefore as this data is used for training we will usually obtain higher values as the model has been optimized to perform well on it.
On the first output we see that the Area under the curve for the all the different ROC curves is quite high (around 0.9) which means the model distinguishes well between classes.
On the plot, the closer the ROC is to the top left corner the better the model will be for distinguishing between classes, clearly the Very High prices are the best prices distinguished followed by Very Low, we see that the model has very clear the difference between these classes with respect to the others.
However we see that the model has more trouble for differentiating Medium and Low prices in relation to the rest of prices, which is where we have most of the error.
The next algorithm we will use is SVMs. They are a model that has many applications, but one of the most common is for supervised classification. Its main idea in plain English is to create a hyper-plane that divides each class while maximizing the margin between this hyper-plane and the two classes. Since the classes are not always separable like this, we use the hinge loss function to penalize misclassication. Mathematically:
\[ \mathcal{L}_i = max(0, 1-y_i(\mathbf{w}^T\mathbf{x_i}-b)) \]
Where \(\mathbf{w}\) are the weights of the hyper-plane, \(x_i\) the features of the observation \(i\), \(b\) the intercept of the hyper-plane and \(y_i\) the target for that observation. With this we see, that when \(\mathbf{w}^T\mathbf{x_i}-b\) is very big and has the opposite sign of \(y_i\) the loss becomes really big. We then have the following minimization problem:
\[ \min_{\mathbf{w}, b} \, C \sum_{i=1}^n \mathcal{L}_i + \frac{1}{2} \mathbf{w}^T \mathbf{w} \]
Where \(C\) is a hyper-parameter that regulates the trade-off between maximizing the margin and minimizing the hinge loss. This formula maximizes the wideness of the margin, which is given by \(\dfrac{2}{\mathbf{w}^T\mathbf{w}}\), while also minimizing the loss function \(\mathcal{L}\).
This is a very powerful algorithm, but due to its complexity its use case is for small to medium size datasets which is exactly what we have. Therefore we expect a high improvement in accuracy.
We will first use Linear SVM before moving to more complex methods, which assumes that the classes are linearly separable.
# For reproducibility purposes
set.seed(123)
# Set up train control for cross-validation
train_control <- trainControl(method = "cv", number = 5)
# Train SVM with a linear kernel
svm_linear_model <- train(SalePrice_Category ~ .,
data = data_enc,
method = "svmLinear",
trControl = train_control)
# Print the results
print(svm_linear_model)
## Support Vector Machines with Linear Kernel
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 894, 895, 896, 896, 895
## Resampling results:
##
## Accuracy Kappa
## 0.7024439 0.6280197
##
## Tuning parameter 'C' was held constant at a value of 1
As we expected there is a higher accuracy (~70%) and kappa (~0.63) compared to the previous models. But we know this can be improved, the following way. Before we said that Linear SVM assumes the data is linearly separable but this is not always the case. To solve this, we use the Kernel SVM that augments the dimensionality of the data with a Kernel function in order to do non-linear classification. The kernel function we will use is Gaussian RBF, since is one of the most common and versatile ones. We will also perform a small hyper-parameter tuning to optimize the results further.
# For reproducibility purposes
set.seed(123)
# Define a grid of hyperparameters to tune
grid <- expand.grid(C = seq(0.7, 2.5, by = 0.1), sigma = seq(0.01, 0.1, by = 0.01))
# Train SVM with grid search
svm_kernel <- train(SalePrice_Category ~ .,
data = data_enc,
method = "svmRadial",
trControl = train_control,
tuneGrid = grid)
# Print the best model
print(svm_kernel)
## Support Vector Machines with Radial Basis Function Kernel
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 894, 895, 896, 896, 895
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 0.7 0.01 0.7032968 0.6290485
## 0.7 0.02 0.7149439 0.6436306
## 0.7 0.03 0.7068683 0.6335203
## 0.7 0.04 0.7149120 0.6435665
## 0.7 0.05 0.7131183 0.6413243
## 0.7 0.06 0.7131263 0.6413298
## 0.7 0.07 0.7122175 0.6401834
## 0.7 0.08 0.7149001 0.6435349
## 0.7 0.09 0.7095309 0.6368208
## 0.7 0.10 0.7077492 0.6345679
## 0.8 0.01 0.7122414 0.6402577
## 0.8 0.02 0.7131542 0.6413916
## 0.8 0.03 0.7122294 0.6402113
## 0.8 0.04 0.7113366 0.6390862
## 0.8 0.05 0.7140112 0.6424386
## 0.8 0.06 0.7175946 0.6469016
## 0.8 0.07 0.7166778 0.6457649
## 0.8 0.08 0.7086261 0.6356951
## 0.8 0.09 0.7086261 0.6356887
## 0.8 0.10 0.7077492 0.6345752
## 0.9 0.01 0.7113566 0.6391588
## 0.9 0.02 0.7184994 0.6480570
## 0.9 0.03 0.7149280 0.6435807
## 0.9 0.04 0.7140191 0.6424436
## 0.9 0.05 0.7211701 0.6513845
## 0.9 0.06 0.7220510 0.6524868
## 0.9 0.07 0.7157929 0.6446633
## 0.9 0.08 0.7139952 0.6424126
## 0.9 0.09 0.7139912 0.6424034
## 0.9 0.10 0.7095109 0.6367920
## 1.0 0.01 0.7140431 0.6425064
## 1.0 0.02 0.7184955 0.6480569
## 1.0 0.03 0.7167297 0.6458342
## 1.0 0.04 0.7211781 0.6514054
## 1.0 0.05 0.7220709 0.6525125
## 1.0 0.06 0.7158049 0.6446715
## 1.0 0.07 0.7131023 0.6413033
## 1.0 0.08 0.7130824 0.6412667
## 1.0 0.09 0.7148721 0.6435044
## 1.0 0.10 0.7130944 0.6412641
## 1.1 0.01 0.7149400 0.6436113
## 1.1 0.02 0.7140272 0.6424618
## 1.1 0.03 0.7211980 0.6514317
## 1.1 0.04 0.7211781 0.6514009
## 1.1 0.05 0.7211781 0.6513964
## 1.1 0.06 0.7157889 0.6446495
## 1.1 0.07 0.7175507 0.6468627
## 1.1 0.08 0.7148721 0.6435084
## 1.1 0.09 0.7148801 0.6435007
## 1.1 0.10 0.7175667 0.6468535
## 1.2 0.01 0.7149480 0.6436104
## 1.2 0.02 0.7176106 0.6469368
## 1.2 0.03 0.7238646 0.6547631
## 1.2 0.04 0.7184875 0.6480402
## 1.2 0.05 0.7202573 0.6502378
## 1.2 0.06 0.7193643 0.6491231
## 1.2 0.07 0.7211301 0.6513350
## 1.2 0.08 0.7193444 0.6490916
## 1.2 0.09 0.7157849 0.6446246
## 1.2 0.10 0.7193524 0.6490918
## 1.3 0.01 0.7185274 0.6480916
## 1.3 0.02 0.7185075 0.6480600
## 1.3 0.03 0.7256544 0.6570022
## 1.3 0.04 0.7211581 0.6513758
## 1.3 0.05 0.7202652 0.6502525
## 1.3 0.06 0.7193604 0.6491151
## 1.3 0.07 0.7238087 0.6546830
## 1.3 0.08 0.7202413 0.6502129
## 1.3 0.09 0.7184555 0.6479654
## 1.3 0.10 0.7184476 0.6479519
## 1.4 0.01 0.7212020 0.6514378
## 1.4 0.02 0.7211980 0.6514223
## 1.4 0.03 0.7256503 0.6569966
## 1.4 0.04 0.7220550 0.6524954
## 1.4 0.05 0.7238287 0.6547043
## 1.4 0.06 0.7193444 0.6491042
## 1.4 0.07 0.7264953 0.6580435
## 1.4 0.08 0.7238127 0.6546783
## 1.4 0.09 0.7175507 0.6468424
## 1.4 0.10 0.7148761 0.6434852
## 1.5 0.01 0.7176225 0.6469621
## 1.5 0.02 0.7211980 0.6514343
## 1.5 0.03 0.7283170 0.6603337
## 1.5 0.04 0.7238408 0.6547252
## 1.5 0.05 0.7238327 0.6547088
## 1.5 0.06 0.7211342 0.6513403
## 1.5 0.07 0.7264953 0.6580430
## 1.5 0.08 0.7220270 0.6524516
## 1.5 0.09 0.7157650 0.6446039
## 1.5 0.10 0.7130944 0.6412520
## 1.6 0.01 0.7194163 0.6492120
## 1.6 0.02 0.7238766 0.6547898
## 1.6 0.03 0.7256304 0.6569862
## 1.6 0.04 0.7238407 0.6547242
## 1.6 0.05 0.7229358 0.6535872
## 1.6 0.06 0.7220350 0.6524687
## 1.6 0.07 0.7264874 0.6580322
## 1.6 0.08 0.7202453 0.6502247
## 1.6 0.09 0.7148641 0.6434769
## 1.6 0.10 0.7131063 0.6412682
## 1.7 0.01 0.7212099 0.6514613
## 1.7 0.02 0.7247734 0.6559111
## 1.7 0.03 0.7282971 0.6603155
## 1.7 0.04 0.7265233 0.6580811
## 1.7 0.05 0.7202532 0.6502369
## 1.7 0.06 0.7229199 0.6535762
## 1.7 0.07 0.7247017 0.6558018
## 1.7 0.08 0.7175627 0.6468662
## 1.7 0.09 0.7157729 0.6446150
## 1.7 0.10 0.7140031 0.6423890
## 1.8 0.01 0.7265711 0.6581619
## 1.8 0.02 0.7247695 0.6559132
## 1.8 0.03 0.7256225 0.6569793
## 1.8 0.04 0.7256304 0.6569659
## 1.8 0.05 0.7229358 0.6535951
## 1.8 0.06 0.7220270 0.6524603
## 1.8 0.07 0.7211262 0.6513288
## 1.8 0.08 0.7193524 0.6491057
## 1.8 0.09 0.7157809 0.6446302
## 1.8 0.10 0.7122134 0.6401517
## 1.9 0.01 0.7256783 0.6570496
## 1.9 0.02 0.7256663 0.6570345
## 1.9 0.03 0.7292059 0.6614643
## 1.9 0.04 0.7229478 0.6536143
## 1.9 0.05 0.7229358 0.6535972
## 1.9 0.06 0.7247137 0.6558200
## 1.9 0.07 0.7238088 0.6546831
## 1.9 0.08 0.7184635 0.6479852
## 1.9 0.09 0.7139992 0.6424008
## 1.9 0.10 0.7122134 0.6401477
## 2.0 0.01 0.7292498 0.6615231
## 2.0 0.02 0.7238686 0.6547819
## 2.0 0.03 0.7300988 0.6625838
## 2.0 0.04 0.7256304 0.6569729
## 2.0 0.05 0.7238407 0.6547266
## 2.0 0.06 0.7282891 0.6602993
## 2.0 0.07 0.7211421 0.6513451
## 2.0 0.08 0.7166778 0.6457530
## 2.0 0.09 0.7113126 0.6390376
## 2.0 0.10 0.7113166 0.6390222
## 2.1 0.01 0.7265752 0.6581709
## 2.1 0.02 0.7238647 0.6547755
## 2.1 0.03 0.7292099 0.6614722
## 2.1 0.04 0.7265273 0.6580944
## 2.1 0.05 0.7274042 0.6591895
## 2.1 0.06 0.7282851 0.6602891
## 2.1 0.07 0.7229398 0.6535820
## 2.1 0.08 0.7149000 0.6435261
## 2.1 0.09 0.7122134 0.6401563
## 2.1 0.10 0.7104277 0.6379165
## 2.2 0.01 0.7256823 0.6570597
## 2.2 0.02 0.7256384 0.6569983
## 2.2 0.03 0.7301068 0.6625901
## 2.2 0.04 0.7238367 0.6547363
## 2.2 0.05 0.7274042 0.6591914
## 2.2 0.06 0.7264914 0.6580427
## 2.2 0.07 0.7220470 0.6524677
## 2.2 0.08 0.7166817 0.6457459
## 2.2 0.09 0.7122094 0.6401514
## 2.2 0.10 0.7059514 0.6323265
## 2.3 0.01 0.7247895 0.6559436
## 2.3 0.02 0.7247296 0.6558626
## 2.3 0.03 0.7301108 0.6625971
## 2.3 0.04 0.7256225 0.6569707
## 2.3 0.05 0.7273962 0.6591797
## 2.3 0.06 0.7264954 0.6580486
## 2.3 0.07 0.7202572 0.6502232
## 2.3 0.08 0.7148920 0.6435080
## 2.3 0.09 0.7131063 0.6412749
## 2.3 0.10 0.7050585 0.6312108
## 2.4 0.01 0.7221148 0.6525987
## 2.4 0.02 0.7265233 0.6581101
## 2.4 0.03 0.7328013 0.6659546
## 2.4 0.04 0.7256225 0.6569733
## 2.4 0.05 0.7274042 0.6591923
## 2.4 0.06 0.7265034 0.6580535
## 2.4 0.07 0.7166818 0.6457520
## 2.4 0.08 0.7157849 0.6446281
## 2.4 0.09 0.7095228 0.6367993
## 2.4 0.10 0.7077451 0.6345705
## 2.5 0.01 0.7221148 0.6525987
## 2.5 0.02 0.7265313 0.6581184
## 2.5 0.03 0.7328013 0.6659546
## 2.5 0.04 0.7292059 0.6614615
## 2.5 0.05 0.7282931 0.6603087
## 2.5 0.06 0.7264994 0.6580447
## 2.5 0.07 0.7166778 0.6457446
## 2.5 0.08 0.7166817 0.6457498
## 2.5 0.09 0.7104157 0.6379137
## 2.5 0.10 0.7059594 0.6323380
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03 and C = 2.4.
After this implementation of SVMs and the hyper-parameter tuning, we get a ~73% accuracy and ~0.66 kappa which is the highes yet for any model. This is a slight improvement over the Linear SVM model we trained before too.
SVMs are black-box models, meaning that it is difficult to know how the predictions have been done, and therefore interpret our results. Because of this we will use the common plot that we have been doing all over the project to observe how the predictions compare to the real values.
# Get predicted class labels using the trained KNN model
pca_data$Predicted_Class_SVM <- predict(svm_kernel, newdata = data_enc)
# Visualize the PCA result with predicted class labels
ggplot(pca_data, aes(x = PC1, y = PC2, color = Predicted_Class_SVM, shape=SalePrice_Category)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "SVM Predictions in 2D (PCA)", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(pca_data$Predicted_Class_SVM))))
Yet again we see the same trend, of difficulty classifying the middle classes, but there is a difference. In this case the class “Medium” has been mostly classified correctly compared with other models. We also observe that overall the classes have been classified correctly for the most part, hence the high accuracy.
The next algorithm we will see are Random Forests. Random Forest are constructed with many decision trees which were explained previously, they combine bagging (training the model with different bootstrap samples) and random feature selection to obtain a model.
Firstly, it draws \(x\) bootstrap samples of size \(n\), there is a decision tree for each bootstrap sample. There is a random set of features chosen at each node and the best feature to split on the node is chosen. As our focus is on classification, random forests will select the most popular vote over all the trees to obtain the final prediction.
They are hard to interpret but in exchange they are very accurate and they reduce variance and overfitting.
Random forests have parameters such as ntree which corresponds to the number of trees created and mtry which corresponds to the number of features selected at each split node. These are some parameters amongs others which could affect the accuracy of the model. Same as decision trees, Random Forests will support categorical data. Firstly, fitting the model.
# For reproducibility purposes
set.seed(123)
# Fit the Random Forest model
rf_model1 <- randomForest(SalePrice_Category ~ .,
data = data_not_enc,
importance = TRUE)
# Print model summary
print(rf_model1)
##
## Call:
## randomForest(formula = SalePrice_Category ~ ., data = data_not_enc, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 30.29%
## Confusion matrix:
## Very Low Low Medium High Very High class.error
## Very Low 174 46 6 0 0 0.2300885
## Low 42 145 37 3 0 0.3612335
## Medium 1 63 118 35 2 0.4611872
## High 1 11 37 151 23 0.3228700
## Very High 0 0 3 29 192 0.1428571
In the output we see we have OOB estimate of error rate which corresponds to out of bag samples. These are the points that were not selected during bootstrapping. After training the model, each tree can make predictions for the corresponding points, providing an estimate of the performance with unseen data. We can see the value is not really good as we want it to be as low as possible and we have a lot of error.
Moreover, we can see the confusion matrix on the classification. One good thing is that the differences between very high and very low are clearly separated.However there are some low points classified as high and some high classified as low which is not a really bad problem. There is a bigger error in classifying the medium price which is where most points are miss classified.
# For reproducibility purposes
set.seed(123)
# Set up train control for cross-validation
train_control <- trainControl(method = "cv", number = 10)
# Training Random Forests
rf_model <- train(SalePrice_Category ~ .,
data = data_not_enc,
method = "rf",
ntree = 500,
tuneGrid = expand.grid(mtry = c(4,5,6)),
maximize = T,
trControl = train_control)
# Print the results
print(rf_model)
## Random Forest
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very Low', 'Low', 'Medium', 'High', 'Very High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1006, 1008, 1007, 1008, 1006, 1008, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 4 0.6890559 0.6112231
## 5 0.6863455 0.6078682
## 6 0.6838182 0.6047218
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
The best accuracy obtained was of ~0.68 with a kappa of 0.63 which is good compared to other models but there is not a really big improvement.
Obtaining a feature importance plot, the different variables are classified according to the Gini index from the decision trees.
# Extracting feature importances
importance_df <- data.frame(
Feature = row.names(importance(rf_model1)),
Importance = importance(rf_model1)[, 'MeanDecreaseGini']
)
# Plotting Feature Importances
ggplot(importance_df,
aes(x = reorder(Feature, Importance),
y = Importance)) +
geom_bar(stat = 'identity', fill = 'steelblue') +
coord_flip() +
ggtitle('Feature Importances from Random Forest') +
xlab('') +
ylab('Mean Decrease in Gini') +
theme_minimal()
On the one hand, clearly the most relevant features are related with area and quality, neighborhood also seems to be pretty significant. This makes sense, as the larger the area and the better the location the bigger the price will be, and viceversa. On the other hand, other features such as mszoning and bldg type are not as relevant as the rest of variables but they do provide some useful information.
We will now use PCA to plot the classification.
pca_data$Predicted_Class_rf <- predict(rf_model, newdata = data_not_enc)
# Visualize the PCA result with predicted class labels
ggplot(pca_data, aes(x = PC1, y = PC2, color = Predicted_Class_rf, shape=SalePrice_Category)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "Random Forest Predictions in 2D (PCA)", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(pca_data$Predicted_Class_rf)))) # Customize colors for predicted classes
We can see a big difference with respect to “geometry-based” models (e.g SVM, KNN…), since the model predicts well categories surrounded by another category, for example when “Very Low” is close to “Low” labeled points. This is really helpful, since for a future ensemble method, using models that complement the others’ weaknesses is one of the more robust approaches.
We will now do a partial dependence plot, to see how each variable impacts the classification of one specific category.
# Generate Partial Dependence Plot for 'SalePrice_Category'
pdp_plot <- partial(rf_model, pred.var = "OverallQual", grid.resolution = 20, prob = TRUE, which.class = 3)
# Plot the Partial Dependence Plot
autoplot(pdp_plot, rug = TRUE, train = data_not_enc) +
ggtitle('Partial Dependence Plot for Overall Quality') +
xlab('Overall Quality') +
ylab('Predicted Probability of Medium Price') +
theme_minimal()
# Generate Partial Dependence Plot for 'SalePrice_Category'
pdp_plot <- partial(rf_model, pred.var = "Neighborhood", grid.resolution = 20, prob = TRUE, which.class = 5)
# Plot the Partial Dependence Plot
autoplot(pdp_plot, rug = TRUE, train = data_not_enc) +
ggtitle('Partial Dependence Plot for each Neighborhood') +
xlab('Neighbourhood') +
ylab('Predicted Probability of Very High Price') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
In the first plot we see how OverallQual impacts the house labeled as “Medium” for price. We observe that values centered around 0, the mean of the this column, have higher probabilities of being classified as “Medium”. This makes sense, since houses with an average quality should neither be expensive nor cheap for the most part.
In the second plot we see how each neighborhood impacts the houses labeled as “Very High”. We see that neighborhood such as NoRidge and NridgHt, seem to have the most positive impact, meaning this are the “expensive” neighborhoods of the city.
Gradient Boosting is an ensemble method used for regression and classification. It builds a strong classifier based on a set of weak learners (decision trees). Although it is computationally expensive and prone to overfitting, models based on this architecture usually are the best performers in many problems, also because modern implementations have mitigated these problems.
Mathematically, what we do is train an ensemble of trees stage-wise and minimizing the loss function iteration over iteration. The typical loss function for multi-class classification (our case) is the following, which is based on the negative log likelihood:
\[ \mathcal{L} = - \sum_{i=1}^{n} \sum_{k=1}^{K} \mathbb{1}(y_i = k) \log P(y_i = k \mid x_i) \]
Where \(1(y_i=k)\) is an indicator functions whose value is 1 if \(y_i=k\) and 0 otherwise. \(P(y_i=k \mid x_i)\) is the probability that the \(i\)-th observation belongs to group \(k\). This probability is calculated using the softmax function:
\[ P(y_i = k \mid x_i) = \dfrac{e^{F_k(x_i)}}{\sum_{j=1}^Ke^{F_j(x_i)}} \]
Where \(F_k(x)\) is the logit produced by the the model for class \(k\).
For each class, the model has a function \(F_k^{(t)}(x)\), which is updated over iterations, and the starting value could be just log-odds of the class priors. Then we compute the residuals (gradients) as following:
\[ g_{ik}^{(t)} = \frac{\partial \mathcal{L}}{\partial F_k(x_i)} = P(y_i = k \mid x_i) - \mathbb{1}(y_i = k) \]
With \(t\) being the iteration number. After this we fit a decision tree, \(h_k^{(t)}\), to predict the negative gradient, so we should end up with something like this: \(h_k^{(t)} \approx -g_{ik}^{(t)}\).
We finally update the model with the prediction of this new weak learner, we scale it by a learning rate \(\eta\). We do this to make a small-step in the right direction, so that we try to reach a local minimum after not just one operation. So we arrive to the final expression of the model:
\[ F_k^{(t+1)}(x) = F_k^{(t)}(x) + \eta h_k^{(t)} (x) \]
We end-up doing our prediction \(\hat{y}\), as the class with the highest probability in the prediction:
\[ \hat{y} = \arg\max\limits_{x}P(y=k \mid x) \]
Gradient Boosting has many specific algorithms with different implementations. One of the most common ones is XGBoost (eXtreme Gradient Boosting) and is the one we will firstly use. To estimate how our model performs we will use cross-validations. Furthermore, hyper-parameter tuning is crucial in Gradient Boosting so we will also perform it to choose the most optimal model.
# For reproducibility purposes
set.seed(123)
levels(data_not_enc$SalePrice_Category) <- make.names(levels(data_not_enc$SalePrice_Category))
levels(data_enc$SalePrice_Category) <- make.names(levels(data_enc$SalePrice_Category))
# Define cross-validation method
train_control <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary # Evaluate multiple classes
)
# Define the hyperparameter grid
grid <- expand.grid(
nrounds = c(50, 100, 200), # Number of boosting iterations
max_depth = c(3, 5), # Maximum tree depth
eta = c(0.01, 0.05, 0.1), # Learning rate
gamma = 1, # Minimum loss reduction
colsample_bytree = 0.8, # Subsample ratio of columns
min_child_weight = c(1, 3, 5), # Minimum sum of weights for a child node
subsample = 0.8 # Subsample ratio of training instances
)
xgb_model <- train(
SalePrice_Category ~ .,
data = data_not_enc,
method = "xgbTree",
trControl = train_control,
tuneGrid = grid,
metric = "Accuracy"
)
## [19:41:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:41:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:00] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:00] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:00] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:00] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:06] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:06] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:06] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:06] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:42:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:43:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:04] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:04] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:04] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:04] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:16] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:16] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:16] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:16] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:44:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
# Display the best model's parameters
cat("Best Model Hyperparameters:\n")
## Best Model Hyperparameters:
print(xgb_model$bestTune)
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 48 200 5 0.1 1 0.8 1 0.8
# Display the best accuracy during training (cross-validation)
max(xgb_model$results$Accuracy)
## [1] 0.6845465
We obtain with the best set of hyper-parameters an accuracy of ~69% which is surprising since XGBoost is one of the most robust models, and for example the random forest we trained before had ~68% accuracy which is similar.
So, just to check we will use another implementation of Gradient Boosting (GBM), and check the results to see if there is a significant improvement.
# For reproducibility purposes
set.seed(123)
# Set up trainControl for cross-validation
train_control <- trainControl(
method = "cv",
number = 5, # 5-fold cross-validation
classProbs = TRUE,
summaryFunction = multiClassSummary
)
# Define hyperparameter grid for tuning
gbm_grid <- expand.grid(
interaction.depth = c(3, 5), # Depth of the trees
n.trees = c(50, 100, 200, 500), # Number of trees
shrinkage = c(0.01, 0.05, 0.1), # Learning rate
n.minobsinnode = c(5, 10) # Minimum number of observations in a node
)
# Train the gradient boosting model using gbm
gbm_model <- train(
SalePrice_Category ~ .,
data = data_not_enc,
method = "gbm",
trControl = train_control,
tuneGrid = gbm_grid,
metric = "Accuracy"
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0218
## 2 1.5935 nan 0.0100 0.0208
## 3 1.5784 nan 0.0100 0.0203
## 4 1.5634 nan 0.0100 0.0204
## 5 1.5485 nan 0.0100 0.0200
## 6 1.5338 nan 0.0100 0.0199
## 7 1.5200 nan 0.0100 0.0186
## 8 1.5068 nan 0.0100 0.0171
## 9 1.4941 nan 0.0100 0.0199
## 10 1.4805 nan 0.0100 0.0173
## 20 1.3688 nan 0.0100 0.0124
## 40 1.2036 nan 0.0100 0.0084
## 60 1.0855 nan 0.0100 0.0055
## 80 0.9961 nan 0.0100 0.0043
## 100 0.9231 nan 0.0100 0.0027
## 120 0.8637 nan 0.0100 0.0028
## 140 0.8138 nan 0.0100 0.0017
## 160 0.7748 nan 0.0100 0.0012
## 180 0.7396 nan 0.0100 0.0005
## 200 0.7077 nan 0.0100 0.0004
## 220 0.6805 nan 0.0100 0.0007
## 240 0.6559 nan 0.0100 0.0004
## 260 0.6335 nan 0.0100 -0.0000
## 280 0.6131 nan 0.0100 -0.0002
## 300 0.5956 nan 0.0100 0.0001
## 320 0.5793 nan 0.0100 0.0002
## 340 0.5641 nan 0.0100 -0.0003
## 360 0.5507 nan 0.0100 -0.0001
## 380 0.5371 nan 0.0100 -0.0002
## 400 0.5241 nan 0.0100 -0.0005
## 420 0.5128 nan 0.0100 -0.0002
## 440 0.5013 nan 0.0100 -0.0002
## 460 0.4905 nan 0.0100 -0.0002
## 480 0.4797 nan 0.0100 -0.0009
## 500 0.4698 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0219
## 2 1.5933 nan 0.0100 0.0220
## 3 1.5778 nan 0.0100 0.0202
## 4 1.5629 nan 0.0100 0.0191
## 5 1.5485 nan 0.0100 0.0206
## 6 1.5342 nan 0.0100 0.0182
## 7 1.5212 nan 0.0100 0.0177
## 8 1.5082 nan 0.0100 0.0178
## 9 1.4951 nan 0.0100 0.0186
## 10 1.4819 nan 0.0100 0.0161
## 20 1.3683 nan 0.0100 0.0126
## 40 1.2021 nan 0.0100 0.0086
## 60 1.0849 nan 0.0100 0.0061
## 80 0.9924 nan 0.0100 0.0037
## 100 0.9202 nan 0.0100 0.0030
## 120 0.8601 nan 0.0100 0.0024
## 140 0.8118 nan 0.0100 0.0021
## 160 0.7701 nan 0.0100 0.0006
## 180 0.7353 nan 0.0100 0.0010
## 200 0.7046 nan 0.0100 0.0010
## 220 0.6774 nan 0.0100 -0.0009
## 240 0.6539 nan 0.0100 0.0006
## 260 0.6328 nan 0.0100 0.0004
## 280 0.6136 nan 0.0100 0.0001
## 300 0.5957 nan 0.0100 -0.0004
## 320 0.5791 nan 0.0100 -0.0001
## 340 0.5644 nan 0.0100 -0.0001
## 360 0.5504 nan 0.0100 -0.0003
## 380 0.5378 nan 0.0100 -0.0004
## 400 0.5251 nan 0.0100 -0.0006
## 420 0.5132 nan 0.0100 0.0000
## 440 0.5017 nan 0.0100 -0.0007
## 460 0.4909 nan 0.0100 -0.0003
## 480 0.4809 nan 0.0100 -0.0003
## 500 0.4718 nan 0.0100 -0.0008
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0258
## 2 1.5899 nan 0.0100 0.0248
## 3 1.5714 nan 0.0100 0.0259
## 4 1.5528 nan 0.0100 0.0258
## 5 1.5346 nan 0.0100 0.0213
## 6 1.5187 nan 0.0100 0.0240
## 7 1.5012 nan 0.0100 0.0216
## 8 1.4859 nan 0.0100 0.0227
## 9 1.4696 nan 0.0100 0.0216
## 10 1.4541 nan 0.0100 0.0203
## 20 1.3197 nan 0.0100 0.0146
## 40 1.1248 nan 0.0100 0.0089
## 60 0.9887 nan 0.0100 0.0072
## 80 0.8883 nan 0.0100 0.0041
## 100 0.8098 nan 0.0100 0.0032
## 120 0.7474 nan 0.0100 0.0025
## 140 0.6949 nan 0.0100 0.0022
## 160 0.6514 nan 0.0100 0.0010
## 180 0.6135 nan 0.0100 0.0008
## 200 0.5816 nan 0.0100 -0.0000
## 220 0.5536 nan 0.0100 0.0002
## 240 0.5289 nan 0.0100 0.0001
## 260 0.5052 nan 0.0100 -0.0000
## 280 0.4849 nan 0.0100 -0.0001
## 300 0.4661 nan 0.0100 -0.0009
## 320 0.4486 nan 0.0100 -0.0004
## 340 0.4330 nan 0.0100 -0.0007
## 360 0.4175 nan 0.0100 -0.0005
## 380 0.4034 nan 0.0100 -0.0003
## 400 0.3904 nan 0.0100 -0.0006
## 420 0.3778 nan 0.0100 -0.0002
## 440 0.3661 nan 0.0100 -0.0003
## 460 0.3553 nan 0.0100 -0.0003
## 480 0.3450 nan 0.0100 -0.0004
## 500 0.3352 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0287
## 2 1.5897 nan 0.0100 0.0267
## 3 1.5705 nan 0.0100 0.0215
## 4 1.5532 nan 0.0100 0.0238
## 5 1.5363 nan 0.0100 0.0247
## 6 1.5187 nan 0.0100 0.0228
## 7 1.5017 nan 0.0100 0.0210
## 8 1.4860 nan 0.0100 0.0232
## 9 1.4701 nan 0.0100 0.0213
## 10 1.4557 nan 0.0100 0.0224
## 20 1.3221 nan 0.0100 0.0147
## 40 1.1289 nan 0.0100 0.0099
## 60 0.9950 nan 0.0100 0.0067
## 80 0.8935 nan 0.0100 0.0046
## 100 0.8161 nan 0.0100 0.0037
## 120 0.7553 nan 0.0100 0.0027
## 140 0.7035 nan 0.0100 0.0009
## 160 0.6607 nan 0.0100 0.0006
## 180 0.6241 nan 0.0100 0.0006
## 200 0.5921 nan 0.0100 0.0001
## 220 0.5647 nan 0.0100 0.0002
## 240 0.5387 nan 0.0100 -0.0001
## 260 0.5145 nan 0.0100 -0.0004
## 280 0.4944 nan 0.0100 -0.0002
## 300 0.4749 nan 0.0100 -0.0001
## 320 0.4574 nan 0.0100 -0.0000
## 340 0.4410 nan 0.0100 0.0000
## 360 0.4254 nan 0.0100 -0.0005
## 380 0.4116 nan 0.0100 -0.0003
## 400 0.3987 nan 0.0100 -0.0007
## 420 0.3866 nan 0.0100 -0.0006
## 440 0.3744 nan 0.0100 -0.0006
## 460 0.3631 nan 0.0100 -0.0010
## 480 0.3523 nan 0.0100 -0.0007
## 500 0.3426 nan 0.0100 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.0896
## 2 1.5363 nan 0.0500 0.0954
## 3 1.4676 nan 0.0500 0.0795
## 4 1.4113 nan 0.0500 0.0720
## 5 1.3574 nan 0.0500 0.0603
## 6 1.3100 nan 0.0500 0.0531
## 7 1.2688 nan 0.0500 0.0539
## 8 1.2291 nan 0.0500 0.0446
## 9 1.1926 nan 0.0500 0.0403
## 10 1.1594 nan 0.0500 0.0363
## 20 0.9322 nan 0.0500 0.0148
## 40 0.7121 nan 0.0500 0.0013
## 60 0.6027 nan 0.0500 0.0009
## 80 0.5296 nan 0.0500 -0.0017
## 100 0.4737 nan 0.0500 -0.0036
## 120 0.4284 nan 0.0500 -0.0009
## 140 0.3924 nan 0.0500 -0.0015
## 160 0.3613 nan 0.0500 -0.0030
## 180 0.3343 nan 0.0500 -0.0011
## 200 0.3106 nan 0.0500 -0.0027
## 220 0.2889 nan 0.0500 -0.0013
## 240 0.2703 nan 0.0500 -0.0018
## 260 0.2531 nan 0.0500 -0.0019
## 280 0.2373 nan 0.0500 -0.0012
## 300 0.2226 nan 0.0500 -0.0015
## 320 0.2094 nan 0.0500 -0.0017
## 340 0.1968 nan 0.0500 -0.0025
## 360 0.1859 nan 0.0500 -0.0017
## 380 0.1752 nan 0.0500 -0.0018
## 400 0.1659 nan 0.0500 -0.0007
## 420 0.1566 nan 0.0500 -0.0017
## 440 0.1491 nan 0.0500 -0.0013
## 460 0.1409 nan 0.0500 -0.0019
## 480 0.1336 nan 0.0500 -0.0022
## 500 0.1267 nan 0.0500 -0.0013
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1011
## 2 1.5335 nan 0.0500 0.0985
## 3 1.4655 nan 0.0500 0.0777
## 4 1.4067 nan 0.0500 0.0640
## 5 1.3572 nan 0.0500 0.0665
## 6 1.3073 nan 0.0500 0.0523
## 7 1.2672 nan 0.0500 0.0542
## 8 1.2277 nan 0.0500 0.0443
## 9 1.1918 nan 0.0500 0.0416
## 10 1.1585 nan 0.0500 0.0343
## 20 0.9275 nan 0.0500 0.0188
## 40 0.7096 nan 0.0500 0.0020
## 60 0.5985 nan 0.0500 -0.0026
## 80 0.5270 nan 0.0500 -0.0031
## 100 0.4754 nan 0.0500 -0.0040
## 120 0.4320 nan 0.0500 -0.0024
## 140 0.3959 nan 0.0500 -0.0034
## 160 0.3651 nan 0.0500 -0.0027
## 180 0.3368 nan 0.0500 -0.0032
## 200 0.3136 nan 0.0500 -0.0030
## 220 0.2908 nan 0.0500 -0.0028
## 240 0.2730 nan 0.0500 -0.0019
## 260 0.2562 nan 0.0500 -0.0021
## 280 0.2403 nan 0.0500 -0.0019
## 300 0.2245 nan 0.0500 -0.0021
## 320 0.2122 nan 0.0500 -0.0034
## 340 0.1999 nan 0.0500 -0.0027
## 360 0.1887 nan 0.0500 -0.0024
## 380 0.1788 nan 0.0500 -0.0020
## 400 0.1693 nan 0.0500 -0.0025
## 420 0.1609 nan 0.0500 -0.0017
## 440 0.1519 nan 0.0500 -0.0012
## 460 0.1444 nan 0.0500 -0.0020
## 480 0.1377 nan 0.0500 -0.0020
## 500 0.1312 nan 0.0500 -0.0023
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1207
## 2 1.5161 nan 0.0500 0.1116
## 3 1.4344 nan 0.0500 0.0964
## 4 1.3654 nan 0.0500 0.0818
## 5 1.3052 nan 0.0500 0.0717
## 6 1.2516 nan 0.0500 0.0596
## 7 1.2021 nan 0.0500 0.0597
## 8 1.1557 nan 0.0500 0.0400
## 9 1.1186 nan 0.0500 0.0516
## 10 1.0797 nan 0.0500 0.0456
## 20 0.8245 nan 0.0500 0.0103
## 40 0.5906 nan 0.0500 0.0020
## 60 0.4736 nan 0.0500 -0.0014
## 80 0.3931 nan 0.0500 -0.0029
## 100 0.3341 nan 0.0500 -0.0028
## 120 0.2904 nan 0.0500 -0.0034
## 140 0.2562 nan 0.0500 -0.0018
## 160 0.2263 nan 0.0500 -0.0023
## 180 0.2023 nan 0.0500 -0.0021
## 200 0.1819 nan 0.0500 -0.0016
## 220 0.1640 nan 0.0500 -0.0015
## 240 0.1479 nan 0.0500 -0.0022
## 260 0.1332 nan 0.0500 -0.0017
## 280 0.1204 nan 0.0500 -0.0020
## 300 0.1101 nan 0.0500 -0.0016
## 320 0.1007 nan 0.0500 -0.0012
## 340 0.0919 nan 0.0500 -0.0012
## 360 0.0838 nan 0.0500 -0.0008
## 380 0.0767 nan 0.0500 -0.0007
## 400 0.0700 nan 0.0500 -0.0009
## 420 0.0643 nan 0.0500 -0.0008
## 440 0.0593 nan 0.0500 -0.0007
## 460 0.0544 nan 0.0500 -0.0011
## 480 0.0503 nan 0.0500 -0.0009
## 500 0.0465 nan 0.0500 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1263
## 2 1.5151 nan 0.0500 0.1127
## 3 1.4319 nan 0.0500 0.0966
## 4 1.3645 nan 0.0500 0.0756
## 5 1.3074 nan 0.0500 0.0663
## 6 1.2533 nan 0.0500 0.0616
## 7 1.2053 nan 0.0500 0.0561
## 8 1.1609 nan 0.0500 0.0530
## 9 1.1194 nan 0.0500 0.0495
## 10 1.0802 nan 0.0500 0.0400
## 20 0.8242 nan 0.0500 0.0139
## 40 0.5999 nan 0.0500 0.0008
## 60 0.4814 nan 0.0500 -0.0033
## 80 0.4065 nan 0.0500 -0.0046
## 100 0.3519 nan 0.0500 -0.0049
## 120 0.3071 nan 0.0500 -0.0039
## 140 0.2703 nan 0.0500 -0.0027
## 160 0.2385 nan 0.0500 -0.0034
## 180 0.2133 nan 0.0500 -0.0026
## 200 0.1921 nan 0.0500 -0.0028
## 220 0.1727 nan 0.0500 -0.0022
## 240 0.1559 nan 0.0500 -0.0024
## 260 0.1406 nan 0.0500 -0.0023
## 280 0.1277 nan 0.0500 -0.0018
## 300 0.1155 nan 0.0500 -0.0018
## 320 0.1052 nan 0.0500 -0.0016
## 340 0.0961 nan 0.0500 -0.0016
## 360 0.0876 nan 0.0500 -0.0015
## 380 0.0801 nan 0.0500 -0.0012
## 400 0.0737 nan 0.0500 -0.0016
## 420 0.0679 nan 0.0500 -0.0009
## 440 0.0627 nan 0.0500 -0.0012
## 460 0.0578 nan 0.0500 -0.0010
## 480 0.0537 nan 0.0500 -0.0007
## 500 0.0496 nan 0.0500 -0.0011
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2164
## 2 1.4543 nan 0.1000 0.1459
## 3 1.3486 nan 0.1000 0.1296
## 4 1.2523 nan 0.1000 0.0840
## 5 1.1842 nan 0.1000 0.0589
## 6 1.1258 nan 0.1000 0.0591
## 7 1.0744 nan 0.1000 0.0482
## 8 1.0261 nan 0.1000 0.0432
## 9 0.9851 nan 0.1000 0.0307
## 10 0.9492 nan 0.1000 0.0251
## 20 0.7226 nan 0.1000 0.0069
## 40 0.5312 nan 0.1000 -0.0034
## 60 0.4246 nan 0.1000 -0.0070
## 80 0.3599 nan 0.1000 -0.0073
## 100 0.3086 nan 0.1000 -0.0069
## 120 0.2714 nan 0.1000 -0.0055
## 140 0.2380 nan 0.1000 -0.0085
## 160 0.2114 nan 0.1000 -0.0046
## 180 0.1882 nan 0.1000 -0.0024
## 200 0.1696 nan 0.1000 -0.0054
## 220 0.1526 nan 0.1000 -0.0038
## 240 0.1362 nan 0.1000 -0.0034
## 260 0.1238 nan 0.1000 -0.0014
## 280 0.1116 nan 0.1000 -0.0019
## 300 0.1016 nan 0.1000 -0.0031
## 320 0.0926 nan 0.1000 -0.0021
## 340 0.0851 nan 0.1000 -0.0022
## 360 0.0786 nan 0.1000 -0.0012
## 380 0.0715 nan 0.1000 -0.0011
## 400 0.0657 nan 0.1000 -0.0011
## 420 0.0606 nan 0.1000 -0.0026
## 440 0.0558 nan 0.1000 -0.0012
## 460 0.0518 nan 0.1000 -0.0011
## 480 0.0480 nan 0.1000 -0.0013
## 500 0.0451 nan 0.1000 -0.0010
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1811
## 2 1.4606 nan 0.1000 0.1470
## 3 1.3494 nan 0.1000 0.1208
## 4 1.2547 nan 0.1000 0.0888
## 5 1.1803 nan 0.1000 0.0679
## 6 1.1211 nan 0.1000 0.0656
## 7 1.0687 nan 0.1000 0.0569
## 8 1.0204 nan 0.1000 0.0404
## 9 0.9823 nan 0.1000 0.0363
## 10 0.9451 nan 0.1000 0.0219
## 20 0.7234 nan 0.1000 0.0045
## 40 0.5385 nan 0.1000 -0.0069
## 60 0.4426 nan 0.1000 -0.0078
## 80 0.3698 nan 0.1000 -0.0086
## 100 0.3192 nan 0.1000 -0.0074
## 120 0.2807 nan 0.1000 -0.0074
## 140 0.2496 nan 0.1000 -0.0058
## 160 0.2238 nan 0.1000 -0.0042
## 180 0.2005 nan 0.1000 -0.0042
## 200 0.1791 nan 0.1000 -0.0062
## 220 0.1604 nan 0.1000 -0.0030
## 240 0.1431 nan 0.1000 -0.0039
## 260 0.1294 nan 0.1000 -0.0026
## 280 0.1180 nan 0.1000 -0.0021
## 300 0.1067 nan 0.1000 -0.0027
## 320 0.0974 nan 0.1000 -0.0030
## 340 0.0894 nan 0.1000 -0.0035
## 360 0.0816 nan 0.1000 -0.0023
## 380 0.0745 nan 0.1000 -0.0023
## 400 0.0682 nan 0.1000 -0.0023
## 420 0.0633 nan 0.1000 -0.0005
## 440 0.0580 nan 0.1000 -0.0020
## 460 0.0533 nan 0.1000 -0.0014
## 480 0.0491 nan 0.1000 -0.0012
## 500 0.0455 nan 0.1000 -0.0022
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2389
## 2 1.4252 nan 0.1000 0.1651
## 3 1.2932 nan 0.1000 0.1045
## 4 1.1971 nan 0.1000 0.0976
## 5 1.1127 nan 0.1000 0.0811
## 6 1.0393 nan 0.1000 0.0588
## 7 0.9840 nan 0.1000 0.0612
## 8 0.9262 nan 0.1000 0.0406
## 9 0.8823 nan 0.1000 0.0350
## 10 0.8422 nan 0.1000 0.0307
## 20 0.5975 nan 0.1000 -0.0006
## 40 0.4063 nan 0.1000 -0.0067
## 60 0.3036 nan 0.1000 -0.0053
## 80 0.2386 nan 0.1000 -0.0060
## 100 0.1915 nan 0.1000 -0.0043
## 120 0.1572 nan 0.1000 -0.0057
## 140 0.1285 nan 0.1000 -0.0042
## 160 0.1056 nan 0.1000 -0.0030
## 180 0.0886 nan 0.1000 -0.0019
## 200 0.0742 nan 0.1000 -0.0001
## 220 0.0627 nan 0.1000 -0.0024
## 240 0.0533 nan 0.1000 -0.0013
## 260 0.0455 nan 0.1000 -0.0014
## 280 0.0394 nan 0.1000 -0.0015
## 300 0.0339 nan 0.1000 -0.0014
## 320 0.0296 nan 0.1000 -0.0008
## 340 0.0258 nan 0.1000 -0.0006
## 360 0.0225 nan 0.1000 -0.0010
## 380 0.0200 nan 0.1000 -0.0013
## 400 0.0178 nan 0.1000 -0.0004
## 420 0.0156 nan 0.1000 -0.0004
## 440 0.0137 nan 0.1000 -0.0005
## 460 0.0124 nan 0.1000 -0.0013
## 480 0.0110 nan 0.1000 -0.0010
## 500 0.0099 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2422
## 2 1.4232 nan 0.1000 0.1665
## 3 1.2924 nan 0.1000 0.1161
## 4 1.1968 nan 0.1000 0.1018
## 5 1.1113 nan 0.1000 0.0887
## 6 1.0372 nan 0.1000 0.0593
## 7 0.9781 nan 0.1000 0.0466
## 8 0.9284 nan 0.1000 0.0437
## 9 0.8834 nan 0.1000 0.0335
## 10 0.8480 nan 0.1000 0.0310
## 20 0.6131 nan 0.1000 -0.0058
## 40 0.4155 nan 0.1000 -0.0131
## 60 0.3122 nan 0.1000 -0.0130
## 80 0.2426 nan 0.1000 -0.0041
## 100 0.1965 nan 0.1000 -0.0068
## 120 0.1605 nan 0.1000 -0.0034
## 140 0.1298 nan 0.1000 -0.0048
## 160 0.1082 nan 0.1000 -0.0028
## 180 0.0893 nan 0.1000 -0.0020
## 200 0.0744 nan 0.1000 -0.0021
## 220 0.0628 nan 0.1000 -0.0022
## 240 0.0533 nan 0.1000 -0.0016
## 260 0.0456 nan 0.1000 -0.0026
## 280 0.0396 nan 0.1000 -0.0010
## 300 0.0346 nan 0.1000 -0.0011
## 320 0.0298 nan 0.1000 -0.0013
## 340 0.0259 nan 0.1000 -0.0007
## 360 0.0228 nan 0.1000 -0.0008
## 380 0.0201 nan 0.1000 -0.0006
## 400 0.0176 nan 0.1000 -0.0014
## 420 0.0156 nan 0.1000 -0.0005
## 440 0.0139 nan 0.1000 -0.0016
## 460 0.0124 nan 0.1000 -0.0015
## 480 0.0111 nan 0.1000 -0.0011
## 500 0.0100 nan 0.1000 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0240
## 2 1.5929 nan 0.0100 0.0177
## 3 1.5791 nan 0.0100 0.0207
## 4 1.5648 nan 0.0100 0.0207
## 5 1.5509 nan 0.0100 0.0179
## 6 1.5370 nan 0.0100 0.0215
## 7 1.5234 nan 0.0100 0.0188
## 8 1.5098 nan 0.0100 0.0169
## 9 1.4970 nan 0.0100 0.0174
## 10 1.4845 nan 0.0100 0.0184
## 20 1.3724 nan 0.0100 0.0149
## 40 1.2037 nan 0.0100 0.0076
## 60 1.0851 nan 0.0100 0.0059
## 80 0.9934 nan 0.0100 0.0053
## 100 0.9227 nan 0.0100 0.0031
## 120 0.8634 nan 0.0100 0.0024
## 140 0.8156 nan 0.0100 0.0020
## 160 0.7744 nan 0.0100 0.0019
## 180 0.7395 nan 0.0100 0.0012
## 200 0.7082 nan 0.0100 0.0010
## 220 0.6815 nan 0.0100 0.0003
## 240 0.6574 nan 0.0100 0.0002
## 260 0.6354 nan 0.0100 -0.0001
## 280 0.6153 nan 0.0100 -0.0003
## 300 0.5968 nan 0.0100 0.0001
## 320 0.5798 nan 0.0100 -0.0007
## 340 0.5644 nan 0.0100 0.0001
## 360 0.5503 nan 0.0100 0.0002
## 380 0.5368 nan 0.0100 -0.0004
## 400 0.5237 nan 0.0100 0.0002
## 420 0.5113 nan 0.0100 -0.0005
## 440 0.5004 nan 0.0100 -0.0002
## 460 0.4896 nan 0.0100 -0.0006
## 480 0.4797 nan 0.0100 -0.0005
## 500 0.4704 nan 0.0100 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0220
## 2 1.5941 nan 0.0100 0.0202
## 3 1.5787 nan 0.0100 0.0219
## 4 1.5630 nan 0.0100 0.0210
## 5 1.5484 nan 0.0100 0.0202
## 6 1.5344 nan 0.0100 0.0189
## 7 1.5206 nan 0.0100 0.0211
## 8 1.5067 nan 0.0100 0.0173
## 9 1.4933 nan 0.0100 0.0172
## 10 1.4807 nan 0.0100 0.0175
## 20 1.3694 nan 0.0100 0.0146
## 40 1.2040 nan 0.0100 0.0084
## 60 1.0867 nan 0.0100 0.0062
## 80 0.9963 nan 0.0100 0.0039
## 100 0.9242 nan 0.0100 0.0026
## 120 0.8660 nan 0.0100 0.0019
## 140 0.8166 nan 0.0100 0.0014
## 160 0.7778 nan 0.0100 0.0014
## 180 0.7420 nan 0.0100 0.0012
## 200 0.7124 nan 0.0100 0.0005
## 220 0.6860 nan 0.0100 0.0001
## 240 0.6617 nan 0.0100 0.0008
## 260 0.6394 nan 0.0100 -0.0003
## 280 0.6201 nan 0.0100 -0.0001
## 300 0.6020 nan 0.0100 0.0002
## 320 0.5847 nan 0.0100 -0.0000
## 340 0.5699 nan 0.0100 -0.0004
## 360 0.5556 nan 0.0100 -0.0005
## 380 0.5429 nan 0.0100 -0.0003
## 400 0.5305 nan 0.0100 -0.0004
## 420 0.5179 nan 0.0100 -0.0003
## 440 0.5064 nan 0.0100 -0.0003
## 460 0.4955 nan 0.0100 -0.0005
## 480 0.4856 nan 0.0100 -0.0003
## 500 0.4752 nan 0.0100 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0261
## 2 1.5904 nan 0.0100 0.0252
## 3 1.5719 nan 0.0100 0.0235
## 4 1.5537 nan 0.0100 0.0240
## 5 1.5360 nan 0.0100 0.0230
## 6 1.5187 nan 0.0100 0.0206
## 7 1.5026 nan 0.0100 0.0222
## 8 1.4864 nan 0.0100 0.0198
## 9 1.4713 nan 0.0100 0.0203
## 10 1.4563 nan 0.0100 0.0207
## 20 1.3236 nan 0.0100 0.0168
## 40 1.1302 nan 0.0100 0.0087
## 60 0.9935 nan 0.0100 0.0055
## 80 0.8948 nan 0.0100 0.0048
## 100 0.8153 nan 0.0100 0.0029
## 120 0.7528 nan 0.0100 0.0022
## 140 0.7018 nan 0.0100 0.0014
## 160 0.6589 nan 0.0100 0.0009
## 180 0.6211 nan 0.0100 0.0004
## 200 0.5890 nan 0.0100 0.0004
## 220 0.5605 nan 0.0100 0.0004
## 240 0.5355 nan 0.0100 -0.0004
## 260 0.5131 nan 0.0100 -0.0002
## 280 0.4929 nan 0.0100 0.0002
## 300 0.4732 nan 0.0100 0.0001
## 320 0.4558 nan 0.0100 -0.0001
## 340 0.4393 nan 0.0100 -0.0006
## 360 0.4237 nan 0.0100 -0.0006
## 380 0.4091 nan 0.0100 -0.0002
## 400 0.3958 nan 0.0100 -0.0005
## 420 0.3831 nan 0.0100 -0.0003
## 440 0.3714 nan 0.0100 -0.0003
## 460 0.3602 nan 0.0100 -0.0004
## 480 0.3494 nan 0.0100 -0.0004
## 500 0.3393 nan 0.0100 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0263
## 2 1.5907 nan 0.0100 0.0235
## 3 1.5730 nan 0.0100 0.0269
## 4 1.5544 nan 0.0100 0.0235
## 5 1.5374 nan 0.0100 0.0228
## 6 1.5205 nan 0.0100 0.0220
## 7 1.5040 nan 0.0100 0.0207
## 8 1.4891 nan 0.0100 0.0185
## 9 1.4746 nan 0.0100 0.0197
## 10 1.4594 nan 0.0100 0.0207
## 20 1.3275 nan 0.0100 0.0149
## 40 1.1367 nan 0.0100 0.0104
## 60 1.0020 nan 0.0100 0.0072
## 80 0.8988 nan 0.0100 0.0045
## 100 0.8214 nan 0.0100 0.0046
## 120 0.7586 nan 0.0100 0.0017
## 140 0.7066 nan 0.0100 0.0010
## 160 0.6638 nan 0.0100 0.0011
## 180 0.6278 nan 0.0100 0.0005
## 200 0.5965 nan 0.0100 0.0002
## 220 0.5672 nan 0.0100 0.0007
## 240 0.5415 nan 0.0100 0.0002
## 260 0.5186 nan 0.0100 0.0000
## 280 0.4980 nan 0.0100 -0.0004
## 300 0.4789 nan 0.0100 -0.0003
## 320 0.4611 nan 0.0100 -0.0003
## 340 0.4445 nan 0.0100 -0.0007
## 360 0.4294 nan 0.0100 -0.0005
## 380 0.4162 nan 0.0100 -0.0005
## 400 0.4023 nan 0.0100 -0.0006
## 420 0.3894 nan 0.0100 -0.0003
## 440 0.3776 nan 0.0100 -0.0010
## 460 0.3665 nan 0.0100 -0.0007
## 480 0.3563 nan 0.0100 -0.0010
## 500 0.3466 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1180
## 2 1.5281 nan 0.0500 0.0871
## 3 1.4629 nan 0.0500 0.0778
## 4 1.4021 nan 0.0500 0.0702
## 5 1.3510 nan 0.0500 0.0611
## 6 1.3048 nan 0.0500 0.0595
## 7 1.2616 nan 0.0500 0.0496
## 8 1.2239 nan 0.0500 0.0419
## 9 1.1892 nan 0.0500 0.0374
## 10 1.1583 nan 0.0500 0.0284
## 20 0.9327 nan 0.0500 0.0144
## 40 0.7167 nan 0.0500 0.0024
## 60 0.6023 nan 0.0500 -0.0006
## 80 0.5276 nan 0.0500 -0.0023
## 100 0.4738 nan 0.0500 -0.0044
## 120 0.4331 nan 0.0500 -0.0046
## 140 0.3974 nan 0.0500 -0.0020
## 160 0.3653 nan 0.0500 -0.0040
## 180 0.3385 nan 0.0500 -0.0042
## 200 0.3132 nan 0.0500 -0.0030
## 220 0.2910 nan 0.0500 -0.0025
## 240 0.2711 nan 0.0500 -0.0030
## 260 0.2547 nan 0.0500 -0.0010
## 280 0.2386 nan 0.0500 -0.0019
## 300 0.2245 nan 0.0500 -0.0020
## 320 0.2111 nan 0.0500 -0.0017
## 340 0.1997 nan 0.0500 -0.0010
## 360 0.1885 nan 0.0500 -0.0014
## 380 0.1781 nan 0.0500 -0.0019
## 400 0.1684 nan 0.0500 -0.0013
## 420 0.1595 nan 0.0500 -0.0016
## 440 0.1504 nan 0.0500 -0.0011
## 460 0.1419 nan 0.0500 -0.0014
## 480 0.1348 nan 0.0500 -0.0012
## 500 0.1279 nan 0.0500 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1094
## 2 1.5306 nan 0.0500 0.0979
## 3 1.4635 nan 0.0500 0.0803
## 4 1.4046 nan 0.0500 0.0696
## 5 1.3525 nan 0.0500 0.0496
## 6 1.3097 nan 0.0500 0.0602
## 7 1.2653 nan 0.0500 0.0446
## 8 1.2282 nan 0.0500 0.0446
## 9 1.1927 nan 0.0500 0.0407
## 10 1.1606 nan 0.0500 0.0350
## 20 0.9362 nan 0.0500 0.0116
## 40 0.7166 nan 0.0500 0.0026
## 60 0.6070 nan 0.0500 -0.0008
## 80 0.5323 nan 0.0500 -0.0031
## 100 0.4796 nan 0.0500 -0.0017
## 120 0.4375 nan 0.0500 -0.0010
## 140 0.3997 nan 0.0500 -0.0051
## 160 0.3691 nan 0.0500 -0.0030
## 180 0.3411 nan 0.0500 -0.0036
## 200 0.3172 nan 0.0500 -0.0029
## 220 0.2947 nan 0.0500 -0.0033
## 240 0.2747 nan 0.0500 -0.0021
## 260 0.2571 nan 0.0500 -0.0024
## 280 0.2418 nan 0.0500 -0.0026
## 300 0.2270 nan 0.0500 -0.0029
## 320 0.2136 nan 0.0500 -0.0018
## 340 0.2020 nan 0.0500 -0.0025
## 360 0.1909 nan 0.0500 -0.0023
## 380 0.1806 nan 0.0500 -0.0022
## 400 0.1713 nan 0.0500 -0.0021
## 420 0.1626 nan 0.0500 -0.0015
## 440 0.1542 nan 0.0500 -0.0019
## 460 0.1457 nan 0.0500 -0.0013
## 480 0.1386 nan 0.0500 -0.0013
## 500 0.1317 nan 0.0500 -0.0018
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1149
## 2 1.5195 nan 0.0500 0.1123
## 3 1.4362 nan 0.0500 0.0958
## 4 1.3683 nan 0.0500 0.0783
## 5 1.3082 nan 0.0500 0.0715
## 6 1.2533 nan 0.0500 0.0590
## 7 1.2012 nan 0.0500 0.0555
## 8 1.1572 nan 0.0500 0.0450
## 9 1.1189 nan 0.0500 0.0416
## 10 1.0822 nan 0.0500 0.0403
## 20 0.8307 nan 0.0500 0.0140
## 40 0.5948 nan 0.0500 0.0025
## 60 0.4759 nan 0.0500 0.0005
## 80 0.3936 nan 0.0500 -0.0021
## 100 0.3354 nan 0.0500 -0.0041
## 120 0.2934 nan 0.0500 -0.0029
## 140 0.2566 nan 0.0500 -0.0022
## 160 0.2269 nan 0.0500 -0.0022
## 180 0.2041 nan 0.0500 -0.0026
## 200 0.1817 nan 0.0500 -0.0018
## 220 0.1623 nan 0.0500 -0.0016
## 240 0.1453 nan 0.0500 -0.0016
## 260 0.1307 nan 0.0500 -0.0011
## 280 0.1178 nan 0.0500 -0.0017
## 300 0.1072 nan 0.0500 -0.0017
## 320 0.0977 nan 0.0500 -0.0018
## 340 0.0894 nan 0.0500 -0.0015
## 360 0.0818 nan 0.0500 -0.0011
## 380 0.0753 nan 0.0500 -0.0014
## 400 0.0693 nan 0.0500 -0.0007
## 420 0.0632 nan 0.0500 -0.0007
## 440 0.0580 nan 0.0500 -0.0005
## 460 0.0536 nan 0.0500 -0.0007
## 480 0.0492 nan 0.0500 -0.0008
## 500 0.0453 nan 0.0500 -0.0008
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1232
## 2 1.5168 nan 0.0500 0.1082
## 3 1.4364 nan 0.0500 0.0902
## 4 1.3688 nan 0.0500 0.0857
## 5 1.3074 nan 0.0500 0.0664
## 6 1.2520 nan 0.0500 0.0644
## 7 1.2025 nan 0.0500 0.0550
## 8 1.1595 nan 0.0500 0.0427
## 9 1.1219 nan 0.0500 0.0457
## 10 1.0851 nan 0.0500 0.0490
## 20 0.8284 nan 0.0500 0.0081
## 40 0.6022 nan 0.0500 0.0007
## 60 0.4847 nan 0.0500 -0.0027
## 80 0.4071 nan 0.0500 -0.0029
## 100 0.3499 nan 0.0500 -0.0029
## 120 0.3051 nan 0.0500 -0.0045
## 140 0.2679 nan 0.0500 -0.0033
## 160 0.2359 nan 0.0500 -0.0026
## 180 0.2098 nan 0.0500 -0.0022
## 200 0.1872 nan 0.0500 -0.0026
## 220 0.1676 nan 0.0500 -0.0024
## 240 0.1501 nan 0.0500 -0.0025
## 260 0.1359 nan 0.0500 -0.0019
## 280 0.1232 nan 0.0500 -0.0025
## 300 0.1124 nan 0.0500 -0.0015
## 320 0.1023 nan 0.0500 -0.0017
## 340 0.0933 nan 0.0500 -0.0011
## 360 0.0855 nan 0.0500 -0.0016
## 380 0.0780 nan 0.0500 -0.0012
## 400 0.0708 nan 0.0500 -0.0007
## 420 0.0646 nan 0.0500 -0.0011
## 440 0.0590 nan 0.0500 -0.0007
## 460 0.0540 nan 0.0500 -0.0006
## 480 0.0494 nan 0.0500 -0.0006
## 500 0.0457 nan 0.0500 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1852
## 2 1.4637 nan 0.1000 0.1348
## 3 1.3565 nan 0.1000 0.1181
## 4 1.2658 nan 0.1000 0.1028
## 5 1.1875 nan 0.1000 0.0657
## 6 1.1250 nan 0.1000 0.0569
## 7 1.0721 nan 0.1000 0.0520
## 8 1.0227 nan 0.1000 0.0434
## 9 0.9833 nan 0.1000 0.0331
## 10 0.9496 nan 0.1000 0.0284
## 20 0.7261 nan 0.1000 0.0036
## 40 0.5383 nan 0.1000 -0.0064
## 60 0.4425 nan 0.1000 -0.0042
## 80 0.3734 nan 0.1000 -0.0078
## 100 0.3198 nan 0.1000 -0.0070
## 120 0.2825 nan 0.1000 -0.0058
## 140 0.2492 nan 0.1000 -0.0067
## 160 0.2180 nan 0.1000 -0.0065
## 180 0.1924 nan 0.1000 -0.0040
## 200 0.1704 nan 0.1000 -0.0034
## 220 0.1509 nan 0.1000 -0.0025
## 240 0.1363 nan 0.1000 -0.0031
## 260 0.1228 nan 0.1000 -0.0041
## 280 0.1110 nan 0.1000 -0.0037
## 300 0.1001 nan 0.1000 -0.0021
## 320 0.0897 nan 0.1000 -0.0019
## 340 0.0819 nan 0.1000 -0.0015
## 360 0.0747 nan 0.1000 -0.0020
## 380 0.0680 nan 0.1000 -0.0019
## 400 0.0622 nan 0.1000 -0.0017
## 420 0.0574 nan 0.1000 -0.0021
## 440 0.0530 nan 0.1000 -0.0009
## 460 0.0487 nan 0.1000 -0.0015
## 480 0.0449 nan 0.1000 -0.0021
## 500 0.0419 nan 0.1000 -0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1745
## 2 1.4667 nan 0.1000 0.1557
## 3 1.3514 nan 0.1000 0.1076
## 4 1.2628 nan 0.1000 0.0912
## 5 1.1904 nan 0.1000 0.0909
## 6 1.1205 nan 0.1000 0.0531
## 7 1.0683 nan 0.1000 0.0664
## 8 1.0146 nan 0.1000 0.0419
## 9 0.9731 nan 0.1000 0.0368
## 10 0.9371 nan 0.1000 0.0281
## 20 0.7233 nan 0.1000 0.0056
## 40 0.5422 nan 0.1000 -0.0027
## 60 0.4459 nan 0.1000 -0.0048
## 80 0.3793 nan 0.1000 -0.0059
## 100 0.3275 nan 0.1000 -0.0065
## 120 0.2847 nan 0.1000 -0.0067
## 140 0.2513 nan 0.1000 -0.0085
## 160 0.2231 nan 0.1000 -0.0047
## 180 0.1985 nan 0.1000 -0.0043
## 200 0.1783 nan 0.1000 -0.0040
## 220 0.1594 nan 0.1000 -0.0051
## 240 0.1444 nan 0.1000 -0.0034
## 260 0.1289 nan 0.1000 -0.0029
## 280 0.1163 nan 0.1000 -0.0031
## 300 0.1064 nan 0.1000 -0.0028
## 320 0.0961 nan 0.1000 -0.0022
## 340 0.0879 nan 0.1000 -0.0019
## 360 0.0803 nan 0.1000 -0.0012
## 380 0.0731 nan 0.1000 -0.0025
## 400 0.0672 nan 0.1000 -0.0027
## 420 0.0619 nan 0.1000 -0.0009
## 440 0.0568 nan 0.1000 -0.0007
## 460 0.0519 nan 0.1000 -0.0015
## 480 0.0481 nan 0.1000 -0.0017
## 500 0.0440 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2417
## 2 1.4307 nan 0.1000 0.1746
## 3 1.2920 nan 0.1000 0.1187
## 4 1.1901 nan 0.1000 0.1004
## 5 1.1084 nan 0.1000 0.0819
## 6 1.0395 nan 0.1000 0.0528
## 7 0.9844 nan 0.1000 0.0496
## 8 0.9355 nan 0.1000 0.0426
## 9 0.8910 nan 0.1000 0.0338
## 10 0.8503 nan 0.1000 0.0228
## 20 0.6111 nan 0.1000 0.0032
## 40 0.4103 nan 0.1000 -0.0011
## 60 0.2993 nan 0.1000 -0.0060
## 80 0.2299 nan 0.1000 -0.0024
## 100 0.1842 nan 0.1000 -0.0046
## 120 0.1495 nan 0.1000 -0.0027
## 140 0.1225 nan 0.1000 -0.0037
## 160 0.1004 nan 0.1000 -0.0024
## 180 0.0825 nan 0.1000 -0.0019
## 200 0.0692 nan 0.1000 -0.0015
## 220 0.0581 nan 0.1000 -0.0011
## 240 0.0491 nan 0.1000 -0.0015
## 260 0.0417 nan 0.1000 -0.0012
## 280 0.0353 nan 0.1000 -0.0011
## 300 0.0299 nan 0.1000 -0.0007
## 320 0.0253 nan 0.1000 -0.0007
## 340 0.0215 nan 0.1000 -0.0007
## 360 0.0184 nan 0.1000 -0.0007
## 380 0.0156 nan 0.1000 -0.0006
## 400 0.0133 nan 0.1000 -0.0003
## 420 0.0115 nan 0.1000 -0.0003
## 440 0.0099 nan 0.1000 -0.0002
## 460 0.0085 nan 0.1000 -0.0001
## 480 0.0074 nan 0.1000 -0.0002
## 500 0.0064 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2311
## 2 1.4266 nan 0.1000 0.1589
## 3 1.2981 nan 0.1000 0.1263
## 4 1.1946 nan 0.1000 0.0995
## 5 1.1126 nan 0.1000 0.0755
## 6 1.0421 nan 0.1000 0.0732
## 7 0.9803 nan 0.1000 0.0451
## 8 0.9331 nan 0.1000 0.0381
## 9 0.8892 nan 0.1000 0.0407
## 10 0.8472 nan 0.1000 0.0314
## 20 0.6112 nan 0.1000 -0.0044
## 40 0.4195 nan 0.1000 -0.0058
## 60 0.3158 nan 0.1000 -0.0028
## 80 0.2422 nan 0.1000 -0.0074
## 100 0.1918 nan 0.1000 -0.0044
## 120 0.1552 nan 0.1000 -0.0056
## 140 0.1274 nan 0.1000 -0.0034
## 160 0.1050 nan 0.1000 -0.0025
## 180 0.0869 nan 0.1000 -0.0045
## 200 0.0725 nan 0.1000 -0.0038
## 220 0.0614 nan 0.1000 -0.0019
## 240 0.0522 nan 0.1000 -0.0015
## 260 0.0448 nan 0.1000 -0.0011
## 280 0.0385 nan 0.1000 -0.0010
## 300 0.0327 nan 0.1000 -0.0018
## 320 0.0282 nan 0.1000 -0.0012
## 340 0.0241 nan 0.1000 -0.0010
## 360 0.0207 nan 0.1000 -0.0007
## 380 0.0177 nan 0.1000 -0.0007
## 400 0.0152 nan 0.1000 -0.0008
## 420 0.0132 nan 0.1000 -0.0006
## 440 0.0115 nan 0.1000 -0.0003
## 460 0.0099 nan 0.1000 -0.0006
## 480 0.0085 nan 0.1000 -0.0004
## 500 0.0074 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0205
## 2 1.5945 nan 0.0100 0.0222
## 3 1.5791 nan 0.0100 0.0207
## 4 1.5644 nan 0.0100 0.0212
## 5 1.5496 nan 0.0100 0.0194
## 6 1.5355 nan 0.0100 0.0167
## 7 1.5221 nan 0.0100 0.0189
## 8 1.5085 nan 0.0100 0.0185
## 9 1.4956 nan 0.0100 0.0179
## 10 1.4836 nan 0.0100 0.0180
## 20 1.3737 nan 0.0100 0.0140
## 40 1.2073 nan 0.0100 0.0073
## 60 1.0897 nan 0.0100 0.0061
## 80 0.9990 nan 0.0100 0.0040
## 100 0.9251 nan 0.0100 0.0034
## 120 0.8669 nan 0.0100 0.0029
## 140 0.8165 nan 0.0100 0.0019
## 160 0.7756 nan 0.0100 0.0011
## 180 0.7401 nan 0.0100 0.0012
## 200 0.7087 nan 0.0100 0.0009
## 220 0.6803 nan 0.0100 0.0005
## 240 0.6558 nan 0.0100 0.0005
## 260 0.6340 nan 0.0100 0.0002
## 280 0.6140 nan 0.0100 0.0001
## 300 0.5955 nan 0.0100 0.0000
## 320 0.5784 nan 0.0100 -0.0000
## 340 0.5626 nan 0.0100 -0.0005
## 360 0.5480 nan 0.0100 0.0002
## 380 0.5346 nan 0.0100 -0.0001
## 400 0.5217 nan 0.0100 -0.0001
## 420 0.5105 nan 0.0100 -0.0003
## 440 0.4989 nan 0.0100 -0.0003
## 460 0.4878 nan 0.0100 -0.0002
## 480 0.4771 nan 0.0100 -0.0005
## 500 0.4670 nan 0.0100 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0205
## 2 1.5944 nan 0.0100 0.0217
## 3 1.5791 nan 0.0100 0.0182
## 4 1.5649 nan 0.0100 0.0213
## 5 1.5503 nan 0.0100 0.0192
## 6 1.5362 nan 0.0100 0.0190
## 7 1.5224 nan 0.0100 0.0170
## 8 1.5094 nan 0.0100 0.0168
## 9 1.4963 nan 0.0100 0.0181
## 10 1.4838 nan 0.0100 0.0169
## 20 1.3721 nan 0.0100 0.0126
## 40 1.2066 nan 0.0100 0.0095
## 60 1.0877 nan 0.0100 0.0053
## 80 0.9971 nan 0.0100 0.0049
## 100 0.9239 nan 0.0100 0.0031
## 120 0.8637 nan 0.0100 0.0026
## 140 0.8161 nan 0.0100 0.0023
## 160 0.7754 nan 0.0100 0.0015
## 180 0.7390 nan 0.0100 0.0014
## 200 0.7081 nan 0.0100 0.0006
## 220 0.6812 nan 0.0100 0.0008
## 240 0.6565 nan 0.0100 0.0006
## 260 0.6344 nan 0.0100 -0.0001
## 280 0.6151 nan 0.0100 0.0002
## 300 0.5967 nan 0.0100 0.0000
## 320 0.5797 nan 0.0100 -0.0001
## 340 0.5644 nan 0.0100 -0.0002
## 360 0.5500 nan 0.0100 -0.0003
## 380 0.5363 nan 0.0100 -0.0000
## 400 0.5236 nan 0.0100 -0.0003
## 420 0.5112 nan 0.0100 -0.0001
## 440 0.4998 nan 0.0100 -0.0000
## 460 0.4890 nan 0.0100 -0.0005
## 480 0.4786 nan 0.0100 -0.0004
## 500 0.4688 nan 0.0100 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0273
## 2 1.5905 nan 0.0100 0.0232
## 3 1.5722 nan 0.0100 0.0235
## 4 1.5547 nan 0.0100 0.0219
## 5 1.5389 nan 0.0100 0.0242
## 6 1.5223 nan 0.0100 0.0219
## 7 1.5065 nan 0.0100 0.0215
## 8 1.4903 nan 0.0100 0.0203
## 9 1.4746 nan 0.0100 0.0209
## 10 1.4595 nan 0.0100 0.0193
## 20 1.3300 nan 0.0100 0.0163
## 40 1.1355 nan 0.0100 0.0095
## 60 0.9977 nan 0.0100 0.0060
## 80 0.8958 nan 0.0100 0.0043
## 100 0.8182 nan 0.0100 0.0031
## 120 0.7531 nan 0.0100 0.0017
## 140 0.7003 nan 0.0100 0.0017
## 160 0.6570 nan 0.0100 0.0015
## 180 0.6199 nan 0.0100 0.0017
## 200 0.5860 nan 0.0100 0.0002
## 220 0.5562 nan 0.0100 0.0001
## 240 0.5299 nan 0.0100 -0.0003
## 260 0.5063 nan 0.0100 0.0000
## 280 0.4860 nan 0.0100 -0.0003
## 300 0.4666 nan 0.0100 -0.0004
## 320 0.4489 nan 0.0100 -0.0005
## 340 0.4324 nan 0.0100 -0.0003
## 360 0.4170 nan 0.0100 -0.0004
## 380 0.4035 nan 0.0100 -0.0003
## 400 0.3897 nan 0.0100 -0.0003
## 420 0.3770 nan 0.0100 -0.0005
## 440 0.3650 nan 0.0100 -0.0004
## 460 0.3538 nan 0.0100 -0.0006
## 480 0.3431 nan 0.0100 -0.0006
## 500 0.3326 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0260
## 2 1.5903 nan 0.0100 0.0243
## 3 1.5723 nan 0.0100 0.0254
## 4 1.5548 nan 0.0100 0.0240
## 5 1.5376 nan 0.0100 0.0222
## 6 1.5213 nan 0.0100 0.0203
## 7 1.5055 nan 0.0100 0.0218
## 8 1.4896 nan 0.0100 0.0218
## 9 1.4742 nan 0.0100 0.0207
## 10 1.4585 nan 0.0100 0.0208
## 20 1.3277 nan 0.0100 0.0154
## 40 1.1338 nan 0.0100 0.0100
## 60 0.9984 nan 0.0100 0.0062
## 80 0.8975 nan 0.0100 0.0047
## 100 0.8178 nan 0.0100 0.0029
## 120 0.7541 nan 0.0100 0.0021
## 140 0.7024 nan 0.0100 0.0006
## 160 0.6592 nan 0.0100 0.0013
## 180 0.6213 nan 0.0100 0.0002
## 200 0.5893 nan 0.0100 0.0006
## 220 0.5609 nan 0.0100 0.0002
## 240 0.5362 nan 0.0100 -0.0005
## 260 0.5138 nan 0.0100 -0.0004
## 280 0.4923 nan 0.0100 -0.0003
## 300 0.4735 nan 0.0100 -0.0001
## 320 0.4560 nan 0.0100 -0.0008
## 340 0.4395 nan 0.0100 -0.0009
## 360 0.4244 nan 0.0100 -0.0001
## 380 0.4097 nan 0.0100 -0.0007
## 400 0.3962 nan 0.0100 -0.0009
## 420 0.3840 nan 0.0100 -0.0007
## 440 0.3716 nan 0.0100 -0.0003
## 460 0.3601 nan 0.0100 -0.0002
## 480 0.3488 nan 0.0100 -0.0009
## 500 0.3388 nan 0.0100 -0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.0959
## 2 1.5352 nan 0.0500 0.0926
## 3 1.4658 nan 0.0500 0.0725
## 4 1.4106 nan 0.0500 0.0741
## 5 1.3585 nan 0.0500 0.0583
## 6 1.3117 nan 0.0500 0.0469
## 7 1.2709 nan 0.0500 0.0521
## 8 1.2307 nan 0.0500 0.0456
## 9 1.1950 nan 0.0500 0.0412
## 10 1.1609 nan 0.0500 0.0373
## 20 0.9341 nan 0.0500 0.0161
## 40 0.7087 nan 0.0500 0.0027
## 60 0.5958 nan 0.0500 -0.0006
## 80 0.5190 nan 0.0500 0.0009
## 100 0.4619 nan 0.0500 -0.0004
## 120 0.4217 nan 0.0500 -0.0018
## 140 0.3858 nan 0.0500 -0.0020
## 160 0.3560 nan 0.0500 -0.0036
## 180 0.3296 nan 0.0500 -0.0028
## 200 0.3054 nan 0.0500 -0.0020
## 220 0.2836 nan 0.0500 -0.0032
## 240 0.2630 nan 0.0500 -0.0027
## 260 0.2456 nan 0.0500 -0.0032
## 280 0.2297 nan 0.0500 -0.0028
## 300 0.2164 nan 0.0500 -0.0021
## 320 0.2023 nan 0.0500 -0.0022
## 340 0.1900 nan 0.0500 -0.0012
## 360 0.1785 nan 0.0500 -0.0021
## 380 0.1679 nan 0.0500 -0.0016
## 400 0.1572 nan 0.0500 -0.0007
## 420 0.1483 nan 0.0500 -0.0014
## 440 0.1398 nan 0.0500 -0.0010
## 460 0.1322 nan 0.0500 -0.0008
## 480 0.1252 nan 0.0500 -0.0011
## 500 0.1185 nan 0.0500 -0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1103
## 2 1.5322 nan 0.0500 0.0879
## 3 1.4673 nan 0.0500 0.0732
## 4 1.4128 nan 0.0500 0.0686
## 5 1.3616 nan 0.0500 0.0557
## 6 1.3161 nan 0.0500 0.0598
## 7 1.2705 nan 0.0500 0.0452
## 8 1.2335 nan 0.0500 0.0423
## 9 1.1982 nan 0.0500 0.0440
## 10 1.1664 nan 0.0500 0.0355
## 20 0.9344 nan 0.0500 0.0222
## 40 0.7103 nan 0.0500 0.0030
## 60 0.6010 nan 0.0500 -0.0009
## 80 0.5280 nan 0.0500 0.0004
## 100 0.4730 nan 0.0500 -0.0031
## 120 0.4296 nan 0.0500 -0.0039
## 140 0.3931 nan 0.0500 -0.0036
## 160 0.3617 nan 0.0500 -0.0029
## 180 0.3352 nan 0.0500 -0.0026
## 200 0.3119 nan 0.0500 -0.0020
## 220 0.2896 nan 0.0500 -0.0032
## 240 0.2699 nan 0.0500 -0.0030
## 260 0.2514 nan 0.0500 -0.0029
## 280 0.2354 nan 0.0500 -0.0029
## 300 0.2217 nan 0.0500 -0.0024
## 320 0.2096 nan 0.0500 -0.0019
## 340 0.1966 nan 0.0500 -0.0019
## 360 0.1844 nan 0.0500 -0.0020
## 380 0.1742 nan 0.0500 -0.0013
## 400 0.1643 nan 0.0500 -0.0015
## 420 0.1564 nan 0.0500 -0.0025
## 440 0.1475 nan 0.0500 -0.0016
## 460 0.1401 nan 0.0500 -0.0017
## 480 0.1325 nan 0.0500 -0.0013
## 500 0.1258 nan 0.0500 -0.0017
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1202
## 2 1.5197 nan 0.0500 0.1065
## 3 1.4428 nan 0.0500 0.0919
## 4 1.3722 nan 0.0500 0.0828
## 5 1.3114 nan 0.0500 0.0771
## 6 1.2547 nan 0.0500 0.0661
## 7 1.2038 nan 0.0500 0.0495
## 8 1.1630 nan 0.0500 0.0486
## 9 1.1226 nan 0.0500 0.0508
## 10 1.0830 nan 0.0500 0.0410
## 20 0.8308 nan 0.0500 0.0162
## 40 0.5991 nan 0.0500 0.0012
## 60 0.4758 nan 0.0500 -0.0016
## 80 0.4000 nan 0.0500 -0.0028
## 100 0.3431 nan 0.0500 -0.0030
## 120 0.2971 nan 0.0500 -0.0029
## 140 0.2620 nan 0.0500 -0.0020
## 160 0.2312 nan 0.0500 -0.0022
## 180 0.2059 nan 0.0500 -0.0018
## 200 0.1838 nan 0.0500 -0.0013
## 220 0.1641 nan 0.0500 -0.0017
## 240 0.1464 nan 0.0500 -0.0025
## 260 0.1324 nan 0.0500 -0.0009
## 280 0.1206 nan 0.0500 -0.0017
## 300 0.1090 nan 0.0500 -0.0024
## 320 0.0990 nan 0.0500 -0.0007
## 340 0.0899 nan 0.0500 -0.0013
## 360 0.0820 nan 0.0500 -0.0011
## 380 0.0750 nan 0.0500 -0.0009
## 400 0.0679 nan 0.0500 -0.0008
## 420 0.0623 nan 0.0500 -0.0008
## 440 0.0572 nan 0.0500 -0.0010
## 460 0.0524 nan 0.0500 -0.0013
## 480 0.0480 nan 0.0500 -0.0009
## 500 0.0442 nan 0.0500 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1126
## 2 1.5195 nan 0.0500 0.1070
## 3 1.4392 nan 0.0500 0.0910
## 4 1.3673 nan 0.0500 0.0764
## 5 1.3083 nan 0.0500 0.0723
## 6 1.2512 nan 0.0500 0.0665
## 7 1.2009 nan 0.0500 0.0540
## 8 1.1566 nan 0.0500 0.0579
## 9 1.1161 nan 0.0500 0.0442
## 10 1.0796 nan 0.0500 0.0364
## 20 0.8307 nan 0.0500 0.0115
## 40 0.5947 nan 0.0500 -0.0001
## 60 0.4814 nan 0.0500 -0.0039
## 80 0.4035 nan 0.0500 -0.0030
## 100 0.3462 nan 0.0500 -0.0026
## 120 0.3017 nan 0.0500 -0.0041
## 140 0.2637 nan 0.0500 -0.0045
## 160 0.2331 nan 0.0500 -0.0015
## 180 0.2070 nan 0.0500 -0.0020
## 200 0.1849 nan 0.0500 -0.0030
## 220 0.1641 nan 0.0500 -0.0022
## 240 0.1475 nan 0.0500 -0.0017
## 260 0.1337 nan 0.0500 -0.0012
## 280 0.1197 nan 0.0500 -0.0023
## 300 0.1084 nan 0.0500 -0.0014
## 320 0.0982 nan 0.0500 -0.0018
## 340 0.0894 nan 0.0500 -0.0013
## 360 0.0811 nan 0.0500 -0.0014
## 380 0.0742 nan 0.0500 -0.0013
## 400 0.0682 nan 0.0500 -0.0013
## 420 0.0624 nan 0.0500 -0.0007
## 440 0.0572 nan 0.0500 -0.0013
## 460 0.0527 nan 0.0500 -0.0009
## 480 0.0482 nan 0.0500 -0.0006
## 500 0.0441 nan 0.0500 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1915
## 2 1.4642 nan 0.1000 0.1540
## 3 1.3487 nan 0.1000 0.1113
## 4 1.2644 nan 0.1000 0.1087
## 5 1.1846 nan 0.1000 0.0546
## 6 1.1321 nan 0.1000 0.0638
## 7 1.0783 nan 0.1000 0.0515
## 8 1.0306 nan 0.1000 0.0369
## 9 0.9900 nan 0.1000 0.0329
## 10 0.9567 nan 0.1000 0.0422
## 20 0.7238 nan 0.1000 0.0036
## 40 0.5318 nan 0.1000 -0.0041
## 60 0.4346 nan 0.1000 -0.0052
## 80 0.3665 nan 0.1000 -0.0049
## 100 0.3139 nan 0.1000 -0.0058
## 120 0.2710 nan 0.1000 -0.0041
## 140 0.2385 nan 0.1000 -0.0026
## 160 0.2078 nan 0.1000 -0.0038
## 180 0.1828 nan 0.1000 -0.0048
## 200 0.1627 nan 0.1000 -0.0030
## 220 0.1443 nan 0.1000 -0.0027
## 240 0.1286 nan 0.1000 -0.0021
## 260 0.1151 nan 0.1000 -0.0026
## 280 0.1022 nan 0.1000 -0.0027
## 300 0.0926 nan 0.1000 -0.0025
## 320 0.0841 nan 0.1000 -0.0022
## 340 0.0770 nan 0.1000 -0.0024
## 360 0.0704 nan 0.1000 -0.0021
## 380 0.0637 nan 0.1000 -0.0015
## 400 0.0585 nan 0.1000 -0.0013
## 420 0.0534 nan 0.1000 -0.0009
## 440 0.0486 nan 0.1000 -0.0013
## 460 0.0441 nan 0.1000 -0.0021
## 480 0.0408 nan 0.1000 -0.0006
## 500 0.0378 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2018
## 2 1.4576 nan 0.1000 0.1439
## 3 1.3523 nan 0.1000 0.1133
## 4 1.2618 nan 0.1000 0.0762
## 5 1.1940 nan 0.1000 0.0866
## 6 1.1260 nan 0.1000 0.0495
## 7 1.0797 nan 0.1000 0.0646
## 8 1.0286 nan 0.1000 0.0373
## 9 0.9893 nan 0.1000 0.0433
## 10 0.9484 nan 0.1000 0.0326
## 20 0.7262 nan 0.1000 0.0063
## 40 0.5342 nan 0.1000 -0.0014
## 60 0.4319 nan 0.1000 -0.0126
## 80 0.3623 nan 0.1000 -0.0067
## 100 0.3136 nan 0.1000 -0.0053
## 120 0.2707 nan 0.1000 -0.0062
## 140 0.2352 nan 0.1000 -0.0064
## 160 0.2084 nan 0.1000 -0.0048
## 180 0.1858 nan 0.1000 -0.0060
## 200 0.1639 nan 0.1000 -0.0030
## 220 0.1451 nan 0.1000 -0.0033
## 240 0.1277 nan 0.1000 -0.0023
## 260 0.1152 nan 0.1000 -0.0021
## 280 0.1038 nan 0.1000 -0.0022
## 300 0.0934 nan 0.1000 -0.0023
## 320 0.0848 nan 0.1000 -0.0020
## 340 0.0766 nan 0.1000 -0.0015
## 360 0.0692 nan 0.1000 -0.0015
## 380 0.0629 nan 0.1000 -0.0015
## 400 0.0574 nan 0.1000 -0.0011
## 420 0.0522 nan 0.1000 -0.0010
## 440 0.0482 nan 0.1000 -0.0014
## 460 0.0443 nan 0.1000 -0.0012
## 480 0.0406 nan 0.1000 -0.0019
## 500 0.0374 nan 0.1000 -0.0008
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2078
## 2 1.4373 nan 0.1000 0.1693
## 3 1.3037 nan 0.1000 0.1209
## 4 1.1984 nan 0.1000 0.0883
## 5 1.1110 nan 0.1000 0.0783
## 6 1.0402 nan 0.1000 0.0698
## 7 0.9776 nan 0.1000 0.0460
## 8 0.9266 nan 0.1000 0.0384
## 9 0.8856 nan 0.1000 0.0434
## 10 0.8430 nan 0.1000 0.0310
## 20 0.5945 nan 0.1000 0.0023
## 40 0.3987 nan 0.1000 -0.0061
## 60 0.2980 nan 0.1000 -0.0039
## 80 0.2282 nan 0.1000 -0.0060
## 100 0.1824 nan 0.1000 -0.0017
## 120 0.1471 nan 0.1000 -0.0047
## 140 0.1195 nan 0.1000 -0.0029
## 160 0.0981 nan 0.1000 -0.0031
## 180 0.0821 nan 0.1000 -0.0026
## 200 0.0681 nan 0.1000 -0.0014
## 220 0.0565 nan 0.1000 -0.0012
## 240 0.0479 nan 0.1000 -0.0012
## 260 0.0409 nan 0.1000 -0.0016
## 280 0.0353 nan 0.1000 -0.0020
## 300 0.0303 nan 0.1000 -0.0006
## 320 0.0261 nan 0.1000 -0.0006
## 340 0.0227 nan 0.1000 -0.0011
## 360 0.0199 nan 0.1000 -0.0004
## 380 0.0175 nan 0.1000 -0.0006
## 400 0.0152 nan 0.1000 -0.0014
## 420 0.0135 nan 0.1000 -0.0005
## 440 0.0123 nan 0.1000 -0.0002
## 460 0.0107 nan 0.1000 -0.0012
## 480 0.0095 nan 0.1000 -0.0012
## 500 0.0084 nan 0.1000 -0.0012
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2442
## 2 1.4320 nan 0.1000 0.1754
## 3 1.3007 nan 0.1000 0.1376
## 4 1.1922 nan 0.1000 0.0870
## 5 1.1156 nan 0.1000 0.0935
## 6 1.0374 nan 0.1000 0.0703
## 7 0.9773 nan 0.1000 0.0627
## 8 0.9245 nan 0.1000 0.0508
## 9 0.8803 nan 0.1000 0.0356
## 10 0.8387 nan 0.1000 0.0300
## 20 0.6067 nan 0.1000 0.0109
## 40 0.4100 nan 0.1000 -0.0080
## 60 0.3054 nan 0.1000 -0.0078
## 80 0.2345 nan 0.1000 -0.0078
## 100 0.1833 nan 0.1000 -0.0050
## 120 0.1469 nan 0.1000 -0.0047
## 140 0.1193 nan 0.1000 -0.0045
## 160 0.0973 nan 0.1000 -0.0030
## 180 0.0804 nan 0.1000 -0.0036
## 200 0.0668 nan 0.1000 -0.0023
## 220 0.0552 nan 0.1000 -0.0012
## 240 0.0473 nan 0.1000 -0.0018
## 260 0.0398 nan 0.1000 -0.0017
## 280 0.0344 nan 0.1000 -0.0016
## 300 0.0295 nan 0.1000 -0.0008
## 320 0.0253 nan 0.1000 -0.0007
## 340 0.0218 nan 0.1000 -0.0012
## 360 0.0191 nan 0.1000 -0.0016
## 380 0.0169 nan 0.1000 -0.0005
## 400 0.0147 nan 0.1000 -0.0012
## 420 0.0129 nan 0.1000 -0.0005
## 440 0.0113 nan 0.1000 -0.0011
## 460 0.0100 nan 0.1000 -0.0003
## 480 0.0089 nan 0.1000 -0.0001
## 500 0.0078 nan 0.1000 -0.0013
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0196
## 2 1.5940 nan 0.0100 0.0189
## 3 1.5789 nan 0.0100 0.0191
## 4 1.5639 nan 0.0100 0.0189
## 5 1.5495 nan 0.0100 0.0213
## 6 1.5347 nan 0.0100 0.0170
## 7 1.5221 nan 0.0100 0.0167
## 8 1.5090 nan 0.0100 0.0186
## 9 1.4966 nan 0.0100 0.0177
## 10 1.4841 nan 0.0100 0.0161
## 20 1.3730 nan 0.0100 0.0142
## 40 1.2092 nan 0.0100 0.0100
## 60 1.0862 nan 0.0100 0.0077
## 80 0.9941 nan 0.0100 0.0048
## 100 0.9190 nan 0.0100 0.0033
## 120 0.8592 nan 0.0100 0.0021
## 140 0.8089 nan 0.0100 0.0013
## 160 0.7669 nan 0.0100 0.0010
## 180 0.7317 nan 0.0100 0.0014
## 200 0.7013 nan 0.0100 0.0005
## 220 0.6738 nan 0.0100 0.0006
## 240 0.6495 nan 0.0100 0.0005
## 260 0.6274 nan 0.0100 -0.0002
## 280 0.6074 nan 0.0100 0.0001
## 300 0.5898 nan 0.0100 0.0002
## 320 0.5727 nan 0.0100 -0.0001
## 340 0.5566 nan 0.0100 -0.0003
## 360 0.5416 nan 0.0100 -0.0006
## 380 0.5283 nan 0.0100 -0.0002
## 400 0.5162 nan 0.0100 -0.0002
## 420 0.5042 nan 0.0100 -0.0006
## 440 0.4931 nan 0.0100 -0.0004
## 460 0.4825 nan 0.0100 -0.0003
## 480 0.4723 nan 0.0100 -0.0005
## 500 0.4628 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0227
## 2 1.5940 nan 0.0100 0.0209
## 3 1.5786 nan 0.0100 0.0203
## 4 1.5637 nan 0.0100 0.0186
## 5 1.5496 nan 0.0100 0.0188
## 6 1.5359 nan 0.0100 0.0177
## 7 1.5226 nan 0.0100 0.0194
## 8 1.5092 nan 0.0100 0.0181
## 9 1.4966 nan 0.0100 0.0169
## 10 1.4841 nan 0.0100 0.0175
## 20 1.3726 nan 0.0100 0.0131
## 40 1.2065 nan 0.0100 0.0093
## 60 1.0864 nan 0.0100 0.0074
## 80 0.9930 nan 0.0100 0.0048
## 100 0.9201 nan 0.0100 0.0036
## 120 0.8600 nan 0.0100 0.0029
## 140 0.8113 nan 0.0100 0.0020
## 160 0.7694 nan 0.0100 0.0017
## 180 0.7339 nan 0.0100 0.0006
## 200 0.7029 nan 0.0100 0.0007
## 220 0.6768 nan 0.0100 0.0006
## 240 0.6525 nan 0.0100 0.0008
## 260 0.6315 nan 0.0100 0.0002
## 280 0.6110 nan 0.0100 0.0003
## 300 0.5939 nan 0.0100 0.0003
## 320 0.5770 nan 0.0100 0.0002
## 340 0.5621 nan 0.0100 -0.0005
## 360 0.5481 nan 0.0100 -0.0001
## 380 0.5349 nan 0.0100 -0.0007
## 400 0.5223 nan 0.0100 -0.0002
## 420 0.5103 nan 0.0100 -0.0004
## 440 0.4989 nan 0.0100 -0.0003
## 460 0.4883 nan 0.0100 -0.0003
## 480 0.4782 nan 0.0100 -0.0005
## 500 0.4686 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0273
## 2 1.5893 nan 0.0100 0.0246
## 3 1.5713 nan 0.0100 0.0212
## 4 1.5537 nan 0.0100 0.0248
## 5 1.5359 nan 0.0100 0.0210
## 6 1.5192 nan 0.0100 0.0232
## 7 1.5023 nan 0.0100 0.0222
## 8 1.4858 nan 0.0100 0.0204
## 9 1.4700 nan 0.0100 0.0193
## 10 1.4551 nan 0.0100 0.0197
## 20 1.3217 nan 0.0100 0.0142
## 40 1.1288 nan 0.0100 0.0090
## 60 0.9926 nan 0.0100 0.0061
## 80 0.8895 nan 0.0100 0.0041
## 100 0.8082 nan 0.0100 0.0042
## 120 0.7432 nan 0.0100 0.0026
## 140 0.6909 nan 0.0100 0.0013
## 160 0.6470 nan 0.0100 0.0019
## 180 0.6100 nan 0.0100 0.0005
## 200 0.5782 nan 0.0100 0.0005
## 220 0.5494 nan 0.0100 0.0000
## 240 0.5244 nan 0.0100 -0.0002
## 260 0.5016 nan 0.0100 0.0002
## 280 0.4812 nan 0.0100 -0.0001
## 300 0.4634 nan 0.0100 -0.0006
## 320 0.4469 nan 0.0100 -0.0003
## 340 0.4311 nan 0.0100 -0.0003
## 360 0.4163 nan 0.0100 -0.0000
## 380 0.4027 nan 0.0100 -0.0008
## 400 0.3888 nan 0.0100 -0.0003
## 420 0.3762 nan 0.0100 -0.0003
## 440 0.3646 nan 0.0100 -0.0007
## 460 0.3536 nan 0.0100 -0.0003
## 480 0.3437 nan 0.0100 -0.0005
## 500 0.3345 nan 0.0100 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0267
## 2 1.5906 nan 0.0100 0.0236
## 3 1.5723 nan 0.0100 0.0232
## 4 1.5554 nan 0.0100 0.0226
## 5 1.5382 nan 0.0100 0.0220
## 6 1.5216 nan 0.0100 0.0215
## 7 1.5061 nan 0.0100 0.0240
## 8 1.4889 nan 0.0100 0.0222
## 9 1.4731 nan 0.0100 0.0216
## 10 1.4574 nan 0.0100 0.0207
## 20 1.3263 nan 0.0100 0.0151
## 40 1.1338 nan 0.0100 0.0108
## 60 0.9971 nan 0.0100 0.0077
## 80 0.8952 nan 0.0100 0.0045
## 100 0.8151 nan 0.0100 0.0026
## 120 0.7516 nan 0.0100 0.0021
## 140 0.6998 nan 0.0100 0.0016
## 160 0.6566 nan 0.0100 0.0009
## 180 0.6187 nan 0.0100 0.0005
## 200 0.5867 nan 0.0100 0.0004
## 220 0.5585 nan 0.0100 0.0002
## 240 0.5342 nan 0.0100 0.0001
## 260 0.5115 nan 0.0100 0.0001
## 280 0.4906 nan 0.0100 -0.0001
## 300 0.4717 nan 0.0100 -0.0003
## 320 0.4546 nan 0.0100 -0.0004
## 340 0.4386 nan 0.0100 0.0000
## 360 0.4233 nan 0.0100 -0.0005
## 380 0.4090 nan 0.0100 -0.0008
## 400 0.3965 nan 0.0100 -0.0005
## 420 0.3845 nan 0.0100 -0.0008
## 440 0.3723 nan 0.0100 -0.0003
## 460 0.3615 nan 0.0100 -0.0006
## 480 0.3509 nan 0.0100 -0.0006
## 500 0.3407 nan 0.0100 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1030
## 2 1.5344 nan 0.0500 0.0892
## 3 1.4698 nan 0.0500 0.0804
## 4 1.4093 nan 0.0500 0.0612
## 5 1.3593 nan 0.0500 0.0707
## 6 1.3110 nan 0.0500 0.0519
## 7 1.2681 nan 0.0500 0.0512
## 8 1.2309 nan 0.0500 0.0401
## 9 1.1965 nan 0.0500 0.0427
## 10 1.1643 nan 0.0500 0.0331
## 20 0.9289 nan 0.0500 0.0157
## 40 0.7088 nan 0.0500 0.0027
## 60 0.5955 nan 0.0500 -0.0007
## 80 0.5194 nan 0.0500 -0.0014
## 100 0.4642 nan 0.0500 -0.0047
## 120 0.4222 nan 0.0500 -0.0027
## 140 0.3878 nan 0.0500 -0.0033
## 160 0.3554 nan 0.0500 -0.0034
## 180 0.3303 nan 0.0500 -0.0031
## 200 0.3050 nan 0.0500 -0.0031
## 220 0.2835 nan 0.0500 -0.0021
## 240 0.2649 nan 0.0500 -0.0011
## 260 0.2465 nan 0.0500 -0.0024
## 280 0.2305 nan 0.0500 -0.0014
## 300 0.2158 nan 0.0500 -0.0019
## 320 0.2042 nan 0.0500 -0.0015
## 340 0.1918 nan 0.0500 -0.0026
## 360 0.1799 nan 0.0500 -0.0019
## 380 0.1695 nan 0.0500 -0.0016
## 400 0.1603 nan 0.0500 -0.0019
## 420 0.1519 nan 0.0500 -0.0019
## 440 0.1438 nan 0.0500 -0.0013
## 460 0.1362 nan 0.0500 -0.0019
## 480 0.1294 nan 0.0500 -0.0013
## 500 0.1230 nan 0.0500 -0.0018
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1091
## 2 1.5319 nan 0.0500 0.0884
## 3 1.4697 nan 0.0500 0.0773
## 4 1.4131 nan 0.0500 0.0704
## 5 1.3590 nan 0.0500 0.0649
## 6 1.3106 nan 0.0500 0.0570
## 7 1.2690 nan 0.0500 0.0439
## 8 1.2326 nan 0.0500 0.0459
## 9 1.1957 nan 0.0500 0.0451
## 10 1.1626 nan 0.0500 0.0366
## 20 0.9323 nan 0.0500 0.0118
## 40 0.7048 nan 0.0500 0.0042
## 60 0.5943 nan 0.0500 -0.0001
## 80 0.5235 nan 0.0500 -0.0011
## 100 0.4711 nan 0.0500 -0.0018
## 120 0.4273 nan 0.0500 -0.0012
## 140 0.3899 nan 0.0500 -0.0026
## 160 0.3607 nan 0.0500 -0.0029
## 180 0.3363 nan 0.0500 -0.0029
## 200 0.3121 nan 0.0500 -0.0022
## 220 0.2913 nan 0.0500 -0.0028
## 240 0.2741 nan 0.0500 -0.0024
## 260 0.2561 nan 0.0500 -0.0022
## 280 0.2401 nan 0.0500 -0.0028
## 300 0.2264 nan 0.0500 -0.0019
## 320 0.2125 nan 0.0500 -0.0022
## 340 0.2011 nan 0.0500 -0.0021
## 360 0.1899 nan 0.0500 -0.0013
## 380 0.1797 nan 0.0500 -0.0020
## 400 0.1705 nan 0.0500 -0.0022
## 420 0.1614 nan 0.0500 -0.0020
## 440 0.1532 nan 0.0500 -0.0016
## 460 0.1449 nan 0.0500 -0.0015
## 480 0.1375 nan 0.0500 -0.0015
## 500 0.1307 nan 0.0500 -0.0022
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1305
## 2 1.5159 nan 0.0500 0.0979
## 3 1.4372 nan 0.0500 0.0789
## 4 1.3691 nan 0.0500 0.0765
## 5 1.3065 nan 0.0500 0.0769
## 6 1.2485 nan 0.0500 0.0653
## 7 1.1955 nan 0.0500 0.0569
## 8 1.1492 nan 0.0500 0.0506
## 9 1.1069 nan 0.0500 0.0489
## 10 1.0691 nan 0.0500 0.0430
## 20 0.8095 nan 0.0500 0.0160
## 40 0.5761 nan 0.0500 -0.0009
## 60 0.4649 nan 0.0500 0.0009
## 80 0.3895 nan 0.0500 -0.0050
## 100 0.3342 nan 0.0500 -0.0034
## 120 0.2907 nan 0.0500 -0.0034
## 140 0.2566 nan 0.0500 -0.0034
## 160 0.2271 nan 0.0500 -0.0033
## 180 0.2024 nan 0.0500 -0.0028
## 200 0.1814 nan 0.0500 -0.0030
## 220 0.1628 nan 0.0500 -0.0015
## 240 0.1466 nan 0.0500 -0.0017
## 260 0.1328 nan 0.0500 -0.0017
## 280 0.1202 nan 0.0500 -0.0014
## 300 0.1090 nan 0.0500 -0.0015
## 320 0.0993 nan 0.0500 -0.0009
## 340 0.0904 nan 0.0500 -0.0010
## 360 0.0831 nan 0.0500 -0.0013
## 380 0.0763 nan 0.0500 -0.0013
## 400 0.0702 nan 0.0500 -0.0011
## 420 0.0647 nan 0.0500 -0.0008
## 440 0.0599 nan 0.0500 -0.0012
## 460 0.0551 nan 0.0500 -0.0008
## 480 0.0505 nan 0.0500 -0.0005
## 500 0.0464 nan 0.0500 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1118
## 2 1.5209 nan 0.0500 0.1058
## 3 1.4426 nan 0.0500 0.0794
## 4 1.3773 nan 0.0500 0.0872
## 5 1.3124 nan 0.0500 0.0627
## 6 1.2596 nan 0.0500 0.0495
## 7 1.2107 nan 0.0500 0.0586
## 8 1.1646 nan 0.0500 0.0561
## 9 1.1212 nan 0.0500 0.0449
## 10 1.0831 nan 0.0500 0.0371
## 20 0.8300 nan 0.0500 0.0141
## 40 0.5959 nan 0.0500 0.0016
## 60 0.4840 nan 0.0500 -0.0025
## 80 0.4046 nan 0.0500 -0.0037
## 100 0.3481 nan 0.0500 -0.0049
## 120 0.3042 nan 0.0500 -0.0031
## 140 0.2673 nan 0.0500 -0.0025
## 160 0.2349 nan 0.0500 -0.0029
## 180 0.2095 nan 0.0500 -0.0024
## 200 0.1885 nan 0.0500 -0.0039
## 220 0.1687 nan 0.0500 -0.0030
## 240 0.1521 nan 0.0500 -0.0026
## 260 0.1386 nan 0.0500 -0.0025
## 280 0.1258 nan 0.0500 -0.0021
## 300 0.1138 nan 0.0500 -0.0014
## 320 0.1038 nan 0.0500 -0.0014
## 340 0.0950 nan 0.0500 -0.0021
## 360 0.0868 nan 0.0500 -0.0012
## 380 0.0792 nan 0.0500 -0.0016
## 400 0.0722 nan 0.0500 -0.0007
## 420 0.0661 nan 0.0500 -0.0013
## 440 0.0611 nan 0.0500 -0.0007
## 460 0.0560 nan 0.0500 -0.0009
## 480 0.0516 nan 0.0500 -0.0011
## 500 0.0475 nan 0.0500 -0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1935
## 2 1.4650 nan 0.1000 0.1459
## 3 1.3549 nan 0.1000 0.1291
## 4 1.2592 nan 0.1000 0.0983
## 5 1.1866 nan 0.1000 0.0663
## 6 1.1269 nan 0.1000 0.0624
## 7 1.0690 nan 0.1000 0.0634
## 8 1.0202 nan 0.1000 0.0566
## 9 0.9758 nan 0.1000 0.0366
## 10 0.9366 nan 0.1000 0.0377
## 20 0.7204 nan 0.1000 0.0021
## 40 0.5310 nan 0.1000 -0.0050
## 60 0.4341 nan 0.1000 -0.0054
## 80 0.3683 nan 0.1000 -0.0029
## 100 0.3120 nan 0.1000 -0.0040
## 120 0.2718 nan 0.1000 -0.0041
## 140 0.2377 nan 0.1000 -0.0049
## 160 0.2106 nan 0.1000 -0.0055
## 180 0.1868 nan 0.1000 -0.0069
## 200 0.1684 nan 0.1000 -0.0037
## 220 0.1503 nan 0.1000 -0.0045
## 240 0.1338 nan 0.1000 -0.0032
## 260 0.1212 nan 0.1000 -0.0030
## 280 0.1089 nan 0.1000 -0.0029
## 300 0.0990 nan 0.1000 -0.0025
## 320 0.0895 nan 0.1000 -0.0019
## 340 0.0816 nan 0.1000 -0.0016
## 360 0.0749 nan 0.1000 -0.0017
## 380 0.0694 nan 0.1000 -0.0009
## 400 0.0637 nan 0.1000 -0.0015
## 420 0.0590 nan 0.1000 -0.0015
## 440 0.0542 nan 0.1000 -0.0012
## 460 0.0496 nan 0.1000 -0.0007
## 480 0.0459 nan 0.1000 -0.0009
## 500 0.0428 nan 0.1000 -0.0008
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1689
## 2 1.4677 nan 0.1000 0.1464
## 3 1.3521 nan 0.1000 0.1044
## 4 1.2675 nan 0.1000 0.0812
## 5 1.1951 nan 0.1000 0.0742
## 6 1.1349 nan 0.1000 0.0689
## 7 1.0802 nan 0.1000 0.0603
## 8 1.0309 nan 0.1000 0.0500
## 9 0.9856 nan 0.1000 0.0309
## 10 0.9459 nan 0.1000 0.0257
## 20 0.7164 nan 0.1000 0.0101
## 40 0.5328 nan 0.1000 -0.0037
## 60 0.4404 nan 0.1000 -0.0056
## 80 0.3728 nan 0.1000 -0.0069
## 100 0.3218 nan 0.1000 -0.0045
## 120 0.2810 nan 0.1000 -0.0065
## 140 0.2474 nan 0.1000 -0.0072
## 160 0.2168 nan 0.1000 -0.0040
## 180 0.1904 nan 0.1000 -0.0040
## 200 0.1681 nan 0.1000 -0.0042
## 220 0.1495 nan 0.1000 -0.0036
## 240 0.1344 nan 0.1000 -0.0043
## 260 0.1215 nan 0.1000 -0.0038
## 280 0.1103 nan 0.1000 -0.0038
## 300 0.1009 nan 0.1000 -0.0039
## 320 0.0922 nan 0.1000 -0.0024
## 340 0.0839 nan 0.1000 -0.0021
## 360 0.0771 nan 0.1000 -0.0029
## 380 0.0707 nan 0.1000 -0.0020
## 400 0.0653 nan 0.1000 -0.0027
## 420 0.0603 nan 0.1000 -0.0013
## 440 0.0558 nan 0.1000 -0.0020
## 460 0.0512 nan 0.1000 -0.0014
## 480 0.0474 nan 0.1000 -0.0010
## 500 0.0441 nan 0.1000 -0.0019
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2195
## 2 1.4348 nan 0.1000 0.1655
## 3 1.3093 nan 0.1000 0.1263
## 4 1.2001 nan 0.1000 0.1088
## 5 1.1115 nan 0.1000 0.0927
## 6 1.0347 nan 0.1000 0.0637
## 7 0.9690 nan 0.1000 0.0478
## 8 0.9157 nan 0.1000 0.0343
## 9 0.8711 nan 0.1000 0.0365
## 10 0.8248 nan 0.1000 0.0213
## 20 0.5939 nan 0.1000 -0.0035
## 40 0.4000 nan 0.1000 -0.0088
## 60 0.2984 nan 0.1000 -0.0081
## 80 0.2324 nan 0.1000 -0.0064
## 100 0.1862 nan 0.1000 -0.0034
## 120 0.1506 nan 0.1000 -0.0042
## 140 0.1232 nan 0.1000 -0.0034
## 160 0.1031 nan 0.1000 -0.0048
## 180 0.0851 nan 0.1000 -0.0020
## 200 0.0728 nan 0.1000 -0.0017
## 220 0.0619 nan 0.1000 -0.0017
## 240 0.0535 nan 0.1000 -0.0019
## 260 0.0465 nan 0.1000 -0.0018
## 280 0.0405 nan 0.1000 -0.0011
## 300 0.0349 nan 0.1000 -0.0010
## 320 0.0303 nan 0.1000 -0.0009
## 340 0.0261 nan 0.1000 -0.0005
## 360 0.0225 nan 0.1000 -0.0007
## 380 0.0193 nan 0.1000 -0.0005
## 400 0.0170 nan 0.1000 -0.0007
## 420 0.0150 nan 0.1000 -0.0005
## 440 0.0131 nan 0.1000 -0.0005
## 460 0.0114 nan 0.1000 -0.0003
## 480 0.0100 nan 0.1000 -0.0002
## 500 0.0088 nan 0.1000 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2263
## 2 1.4380 nan 0.1000 0.1793
## 3 1.2961 nan 0.1000 0.1212
## 4 1.1962 nan 0.1000 0.1066
## 5 1.1097 nan 0.1000 0.0817
## 6 1.0382 nan 0.1000 0.0539
## 7 0.9845 nan 0.1000 0.0568
## 8 0.9302 nan 0.1000 0.0295
## 9 0.8878 nan 0.1000 0.0266
## 10 0.8472 nan 0.1000 0.0293
## 20 0.6035 nan 0.1000 -0.0008
## 40 0.4093 nan 0.1000 -0.0075
## 60 0.3102 nan 0.1000 -0.0103
## 80 0.2398 nan 0.1000 -0.0061
## 100 0.1902 nan 0.1000 -0.0040
## 120 0.1543 nan 0.1000 -0.0027
## 140 0.1267 nan 0.1000 -0.0043
## 160 0.1045 nan 0.1000 -0.0038
## 180 0.0878 nan 0.1000 -0.0041
## 200 0.0743 nan 0.1000 -0.0032
## 220 0.0628 nan 0.1000 -0.0015
## 240 0.0533 nan 0.1000 -0.0034
## 260 0.0454 nan 0.1000 -0.0016
## 280 0.0393 nan 0.1000 -0.0015
## 300 0.0338 nan 0.1000 -0.0016
## 320 0.0292 nan 0.1000 -0.0009
## 340 0.0253 nan 0.1000 -0.0012
## 360 0.0221 nan 0.1000 -0.0007
## 380 0.0193 nan 0.1000 -0.0005
## 400 0.0170 nan 0.1000 -0.0005
## 420 0.0148 nan 0.1000 -0.0005
## 440 0.0129 nan 0.1000 -0.0005
## 460 0.0112 nan 0.1000 -0.0005
## 480 0.0096 nan 0.1000 -0.0003
## 500 0.0085 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0203
## 2 1.5946 nan 0.0100 0.0208
## 3 1.5793 nan 0.0100 0.0209
## 4 1.5642 nan 0.0100 0.0185
## 5 1.5501 nan 0.0100 0.0183
## 6 1.5370 nan 0.0100 0.0199
## 7 1.5231 nan 0.0100 0.0193
## 8 1.5098 nan 0.0100 0.0182
## 9 1.4968 nan 0.0100 0.0168
## 10 1.4847 nan 0.0100 0.0152
## 20 1.3763 nan 0.0100 0.0131
## 40 1.2112 nan 0.0100 0.0088
## 60 1.0926 nan 0.0100 0.0055
## 80 1.0040 nan 0.0100 0.0047
## 100 0.9339 nan 0.0100 0.0033
## 120 0.8742 nan 0.0100 0.0028
## 140 0.8243 nan 0.0100 0.0015
## 160 0.7835 nan 0.0100 0.0017
## 180 0.7474 nan 0.0100 0.0008
## 200 0.7167 nan 0.0100 0.0010
## 220 0.6898 nan 0.0100 0.0004
## 240 0.6660 nan 0.0100 0.0001
## 260 0.6446 nan 0.0100 0.0007
## 280 0.6253 nan 0.0100 0.0002
## 300 0.6059 nan 0.0100 0.0001
## 320 0.5884 nan 0.0100 0.0003
## 340 0.5726 nan 0.0100 -0.0001
## 360 0.5576 nan 0.0100 -0.0003
## 380 0.5442 nan 0.0100 -0.0003
## 400 0.5313 nan 0.0100 -0.0004
## 420 0.5186 nan 0.0100 -0.0002
## 440 0.5062 nan 0.0100 -0.0004
## 460 0.4955 nan 0.0100 -0.0004
## 480 0.4849 nan 0.0100 -0.0003
## 500 0.4748 nan 0.0100 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0212
## 2 1.5942 nan 0.0100 0.0214
## 3 1.5794 nan 0.0100 0.0187
## 4 1.5659 nan 0.0100 0.0192
## 5 1.5520 nan 0.0100 0.0187
## 6 1.5383 nan 0.0100 0.0189
## 7 1.5250 nan 0.0100 0.0199
## 8 1.5114 nan 0.0100 0.0177
## 9 1.4988 nan 0.0100 0.0158
## 10 1.4870 nan 0.0100 0.0173
## 20 1.3779 nan 0.0100 0.0134
## 40 1.2148 nan 0.0100 0.0092
## 60 1.0968 nan 0.0100 0.0051
## 80 1.0058 nan 0.0100 0.0051
## 100 0.9344 nan 0.0100 0.0030
## 120 0.8763 nan 0.0100 0.0029
## 140 0.8269 nan 0.0100 0.0018
## 160 0.7860 nan 0.0100 0.0013
## 180 0.7523 nan 0.0100 0.0012
## 200 0.7201 nan 0.0100 0.0003
## 220 0.6930 nan 0.0100 0.0008
## 240 0.6683 nan 0.0100 0.0003
## 260 0.6467 nan 0.0100 0.0005
## 280 0.6269 nan 0.0100 -0.0006
## 300 0.6078 nan 0.0100 0.0001
## 320 0.5918 nan 0.0100 -0.0002
## 340 0.5759 nan 0.0100 -0.0004
## 360 0.5607 nan 0.0100 -0.0004
## 380 0.5473 nan 0.0100 -0.0002
## 400 0.5341 nan 0.0100 -0.0001
## 420 0.5216 nan 0.0100 -0.0003
## 440 0.5103 nan 0.0100 -0.0008
## 460 0.4995 nan 0.0100 -0.0006
## 480 0.4890 nan 0.0100 -0.0004
## 500 0.4794 nan 0.0100 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0260
## 2 1.5907 nan 0.0100 0.0255
## 3 1.5723 nan 0.0100 0.0249
## 4 1.5545 nan 0.0100 0.0232
## 5 1.5376 nan 0.0100 0.0229
## 6 1.5212 nan 0.0100 0.0229
## 7 1.5048 nan 0.0100 0.0198
## 8 1.4896 nan 0.0100 0.0228
## 9 1.4733 nan 0.0100 0.0213
## 10 1.4578 nan 0.0100 0.0185
## 20 1.3282 nan 0.0100 0.0160
## 40 1.1383 nan 0.0100 0.0101
## 60 1.0044 nan 0.0100 0.0078
## 80 0.9046 nan 0.0100 0.0044
## 100 0.8274 nan 0.0100 0.0040
## 120 0.7630 nan 0.0100 0.0018
## 140 0.7113 nan 0.0100 0.0013
## 160 0.6670 nan 0.0100 0.0011
## 180 0.6286 nan 0.0100 0.0004
## 200 0.5947 nan 0.0100 -0.0005
## 220 0.5661 nan 0.0100 0.0001
## 240 0.5411 nan 0.0100 -0.0008
## 260 0.5177 nan 0.0100 0.0002
## 280 0.4971 nan 0.0100 0.0002
## 300 0.4784 nan 0.0100 -0.0006
## 320 0.4602 nan 0.0100 -0.0002
## 340 0.4443 nan 0.0100 -0.0002
## 360 0.4291 nan 0.0100 -0.0002
## 380 0.4146 nan 0.0100 -0.0004
## 400 0.4008 nan 0.0100 -0.0008
## 420 0.3890 nan 0.0100 -0.0003
## 440 0.3768 nan 0.0100 -0.0008
## 460 0.3655 nan 0.0100 -0.0005
## 480 0.3554 nan 0.0100 -0.0004
## 500 0.3450 nan 0.0100 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0100 0.0270
## 2 1.5910 nan 0.0100 0.0241
## 3 1.5727 nan 0.0100 0.0223
## 4 1.5553 nan 0.0100 0.0228
## 5 1.5389 nan 0.0100 0.0223
## 6 1.5227 nan 0.0100 0.0223
## 7 1.5061 nan 0.0100 0.0225
## 8 1.4907 nan 0.0100 0.0218
## 9 1.4751 nan 0.0100 0.0194
## 10 1.4600 nan 0.0100 0.0203
## 20 1.3291 nan 0.0100 0.0148
## 40 1.1414 nan 0.0100 0.0095
## 60 1.0069 nan 0.0100 0.0070
## 80 0.9055 nan 0.0100 0.0037
## 100 0.8288 nan 0.0100 0.0040
## 120 0.7652 nan 0.0100 0.0019
## 140 0.7137 nan 0.0100 0.0020
## 160 0.6705 nan 0.0100 0.0012
## 180 0.6328 nan 0.0100 0.0006
## 200 0.6000 nan 0.0100 0.0002
## 220 0.5717 nan 0.0100 0.0007
## 240 0.5463 nan 0.0100 0.0002
## 260 0.5231 nan 0.0100 -0.0001
## 280 0.5016 nan 0.0100 -0.0003
## 300 0.4817 nan 0.0100 -0.0004
## 320 0.4637 nan 0.0100 -0.0005
## 340 0.4476 nan 0.0100 -0.0001
## 360 0.4326 nan 0.0100 -0.0005
## 380 0.4188 nan 0.0100 -0.0007
## 400 0.4049 nan 0.0100 -0.0005
## 420 0.3926 nan 0.0100 -0.0005
## 440 0.3805 nan 0.0100 -0.0008
## 460 0.3691 nan 0.0100 -0.0003
## 480 0.3586 nan 0.0100 -0.0008
## 500 0.3482 nan 0.0100 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.0941
## 2 1.5354 nan 0.0500 0.0791
## 3 1.4723 nan 0.0500 0.0814
## 4 1.4128 nan 0.0500 0.0687
## 5 1.3623 nan 0.0500 0.0609
## 6 1.3168 nan 0.0500 0.0525
## 7 1.2756 nan 0.0500 0.0426
## 8 1.2384 nan 0.0500 0.0407
## 9 1.2059 nan 0.0500 0.0420
## 10 1.1739 nan 0.0500 0.0355
## 20 0.9470 nan 0.0500 0.0156
## 40 0.7247 nan 0.0500 0.0038
## 60 0.6089 nan 0.0500 -0.0020
## 80 0.5334 nan 0.0500 -0.0004
## 100 0.4801 nan 0.0500 -0.0017
## 120 0.4346 nan 0.0500 -0.0022
## 140 0.3999 nan 0.0500 -0.0022
## 160 0.3677 nan 0.0500 -0.0026
## 180 0.3411 nan 0.0500 -0.0023
## 200 0.3170 nan 0.0500 -0.0022
## 220 0.2950 nan 0.0500 -0.0034
## 240 0.2747 nan 0.0500 -0.0024
## 260 0.2579 nan 0.0500 -0.0016
## 280 0.2427 nan 0.0500 -0.0026
## 300 0.2277 nan 0.0500 -0.0028
## 320 0.2143 nan 0.0500 -0.0021
## 340 0.2024 nan 0.0500 -0.0019
## 360 0.1923 nan 0.0500 -0.0015
## 380 0.1807 nan 0.0500 -0.0021
## 400 0.1712 nan 0.0500 -0.0019
## 420 0.1622 nan 0.0500 -0.0019
## 440 0.1542 nan 0.0500 -0.0017
## 460 0.1460 nan 0.0500 -0.0024
## 480 0.1385 nan 0.0500 -0.0017
## 500 0.1318 nan 0.0500 -0.0017
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1094
## 2 1.5294 nan 0.0500 0.0872
## 3 1.4660 nan 0.0500 0.0860
## 4 1.4085 nan 0.0500 0.0704
## 5 1.3569 nan 0.0500 0.0556
## 6 1.3154 nan 0.0500 0.0531
## 7 1.2735 nan 0.0500 0.0471
## 8 1.2354 nan 0.0500 0.0422
## 9 1.2007 nan 0.0500 0.0360
## 10 1.1691 nan 0.0500 0.0342
## 20 0.9434 nan 0.0500 0.0187
## 40 0.7216 nan 0.0500 0.0025
## 60 0.6101 nan 0.0500 -0.0023
## 80 0.5418 nan 0.0500 -0.0016
## 100 0.4871 nan 0.0500 -0.0019
## 120 0.4416 nan 0.0500 -0.0029
## 140 0.4059 nan 0.0500 -0.0041
## 160 0.3771 nan 0.0500 -0.0032
## 180 0.3515 nan 0.0500 -0.0034
## 200 0.3278 nan 0.0500 -0.0042
## 220 0.3069 nan 0.0500 -0.0029
## 240 0.2864 nan 0.0500 -0.0025
## 260 0.2677 nan 0.0500 -0.0016
## 280 0.2519 nan 0.0500 -0.0034
## 300 0.2371 nan 0.0500 -0.0024
## 320 0.2245 nan 0.0500 -0.0023
## 340 0.2105 nan 0.0500 -0.0025
## 360 0.1986 nan 0.0500 -0.0020
## 380 0.1875 nan 0.0500 -0.0014
## 400 0.1774 nan 0.0500 -0.0022
## 420 0.1686 nan 0.0500 -0.0030
## 440 0.1600 nan 0.0500 -0.0016
## 460 0.1516 nan 0.0500 -0.0021
## 480 0.1441 nan 0.0500 -0.0017
## 500 0.1380 nan 0.0500 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1195
## 2 1.5167 nan 0.0500 0.1068
## 3 1.4360 nan 0.0500 0.0948
## 4 1.3651 nan 0.0500 0.0819
## 5 1.3042 nan 0.0500 0.0655
## 6 1.2515 nan 0.0500 0.0610
## 7 1.2039 nan 0.0500 0.0544
## 8 1.1616 nan 0.0500 0.0471
## 9 1.1234 nan 0.0500 0.0406
## 10 1.0863 nan 0.0500 0.0344
## 20 0.8391 nan 0.0500 0.0109
## 40 0.6045 nan 0.0500 0.0009
## 60 0.4834 nan 0.0500 -0.0010
## 80 0.4053 nan 0.0500 -0.0017
## 100 0.3482 nan 0.0500 -0.0033
## 120 0.3051 nan 0.0500 -0.0010
## 140 0.2673 nan 0.0500 -0.0013
## 160 0.2372 nan 0.0500 -0.0026
## 180 0.2106 nan 0.0500 -0.0018
## 200 0.1877 nan 0.0500 -0.0024
## 220 0.1703 nan 0.0500 -0.0027
## 240 0.1550 nan 0.0500 -0.0014
## 260 0.1403 nan 0.0500 -0.0021
## 280 0.1279 nan 0.0500 -0.0016
## 300 0.1163 nan 0.0500 -0.0012
## 320 0.1055 nan 0.0500 -0.0016
## 340 0.0972 nan 0.0500 -0.0017
## 360 0.0892 nan 0.0500 -0.0015
## 380 0.0822 nan 0.0500 -0.0013
## 400 0.0755 nan 0.0500 -0.0019
## 420 0.0699 nan 0.0500 -0.0006
## 440 0.0642 nan 0.0500 -0.0006
## 460 0.0596 nan 0.0500 -0.0008
## 480 0.0552 nan 0.0500 -0.0013
## 500 0.0513 nan 0.0500 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1148
## 2 1.5229 nan 0.0500 0.1047
## 3 1.4464 nan 0.0500 0.0863
## 4 1.3775 nan 0.0500 0.0770
## 5 1.3184 nan 0.0500 0.0731
## 6 1.2645 nan 0.0500 0.0550
## 7 1.2149 nan 0.0500 0.0502
## 8 1.1733 nan 0.0500 0.0491
## 9 1.1299 nan 0.0500 0.0417
## 10 1.0939 nan 0.0500 0.0404
## 20 0.8380 nan 0.0500 0.0088
## 40 0.6151 nan 0.0500 0.0003
## 60 0.4964 nan 0.0500 -0.0009
## 80 0.4131 nan 0.0500 -0.0038
## 100 0.3594 nan 0.0500 -0.0058
## 120 0.3128 nan 0.0500 -0.0022
## 140 0.2746 nan 0.0500 -0.0029
## 160 0.2435 nan 0.0500 -0.0041
## 180 0.2175 nan 0.0500 -0.0021
## 200 0.1952 nan 0.0500 -0.0031
## 220 0.1768 nan 0.0500 -0.0035
## 240 0.1596 nan 0.0500 -0.0025
## 260 0.1443 nan 0.0500 -0.0023
## 280 0.1311 nan 0.0500 -0.0019
## 300 0.1191 nan 0.0500 -0.0014
## 320 0.1088 nan 0.0500 -0.0019
## 340 0.0998 nan 0.0500 -0.0015
## 360 0.0912 nan 0.0500 -0.0015
## 380 0.0840 nan 0.0500 -0.0017
## 400 0.0773 nan 0.0500 -0.0011
## 420 0.0716 nan 0.0500 -0.0010
## 440 0.0661 nan 0.0500 -0.0010
## 460 0.0612 nan 0.0500 -0.0013
## 480 0.0566 nan 0.0500 -0.0009
## 500 0.0531 nan 0.0500 -0.0008
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1721
## 2 1.4729 nan 0.1000 0.1485
## 3 1.3633 nan 0.1000 0.1030
## 4 1.2792 nan 0.1000 0.1078
## 5 1.1972 nan 0.1000 0.0634
## 6 1.1375 nan 0.1000 0.0618
## 7 1.0818 nan 0.1000 0.0444
## 8 1.0364 nan 0.1000 0.0392
## 9 0.9993 nan 0.1000 0.0522
## 10 0.9612 nan 0.1000 0.0376
## 20 0.7269 nan 0.1000 0.0085
## 40 0.5302 nan 0.1000 -0.0051
## 60 0.4338 nan 0.1000 -0.0092
## 80 0.3732 nan 0.1000 -0.0083
## 100 0.3188 nan 0.1000 -0.0064
## 120 0.2791 nan 0.1000 -0.0074
## 140 0.2465 nan 0.1000 -0.0074
## 160 0.2187 nan 0.1000 -0.0048
## 180 0.1933 nan 0.1000 -0.0023
## 200 0.1748 nan 0.1000 -0.0037
## 220 0.1560 nan 0.1000 -0.0041
## 240 0.1411 nan 0.1000 -0.0037
## 260 0.1290 nan 0.1000 -0.0043
## 280 0.1182 nan 0.1000 -0.0023
## 300 0.1077 nan 0.1000 -0.0034
## 320 0.0985 nan 0.1000 -0.0031
## 340 0.0912 nan 0.1000 -0.0017
## 360 0.0849 nan 0.1000 -0.0021
## 380 0.0779 nan 0.1000 -0.0025
## 400 0.0721 nan 0.1000 -0.0014
## 420 0.0670 nan 0.1000 -0.0020
## 440 0.0624 nan 0.1000 -0.0017
## 460 0.0589 nan 0.1000 -0.0024
## 480 0.0548 nan 0.1000 -0.0014
## 500 0.0508 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1739
## 2 1.4697 nan 0.1000 0.1596
## 3 1.3584 nan 0.1000 0.1185
## 4 1.2694 nan 0.1000 0.0916
## 5 1.1979 nan 0.1000 0.0805
## 6 1.1325 nan 0.1000 0.0636
## 7 1.0767 nan 0.1000 0.0466
## 8 1.0338 nan 0.1000 0.0521
## 9 0.9918 nan 0.1000 0.0374
## 10 0.9558 nan 0.1000 0.0316
## 20 0.7385 nan 0.1000 0.0052
## 40 0.5515 nan 0.1000 -0.0024
## 60 0.4488 nan 0.1000 -0.0088
## 80 0.3777 nan 0.1000 -0.0106
## 100 0.3267 nan 0.1000 -0.0083
## 120 0.2855 nan 0.1000 -0.0074
## 140 0.2511 nan 0.1000 -0.0055
## 160 0.2209 nan 0.1000 -0.0042
## 180 0.1950 nan 0.1000 -0.0065
## 200 0.1751 nan 0.1000 -0.0046
## 220 0.1596 nan 0.1000 -0.0036
## 240 0.1445 nan 0.1000 -0.0033
## 260 0.1306 nan 0.1000 -0.0029
## 280 0.1173 nan 0.1000 -0.0021
## 300 0.1070 nan 0.1000 -0.0031
## 320 0.0986 nan 0.1000 -0.0026
## 340 0.0913 nan 0.1000 -0.0024
## 360 0.0830 nan 0.1000 -0.0020
## 380 0.0759 nan 0.1000 -0.0030
## 400 0.0699 nan 0.1000 -0.0017
## 420 0.0645 nan 0.1000 -0.0015
## 440 0.0602 nan 0.1000 -0.0024
## 460 0.0562 nan 0.1000 -0.0016
## 480 0.0523 nan 0.1000 -0.0025
## 500 0.0490 nan 0.1000 -0.0020
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.2002
## 2 1.4403 nan 0.1000 0.1602
## 3 1.3094 nan 0.1000 0.1344
## 4 1.2039 nan 0.1000 0.0939
## 5 1.1193 nan 0.1000 0.0776
## 6 1.0485 nan 0.1000 0.0552
## 7 0.9973 nan 0.1000 0.0385
## 8 0.9516 nan 0.1000 0.0494
## 9 0.9035 nan 0.1000 0.0433
## 10 0.8569 nan 0.1000 0.0262
## 20 0.6228 nan 0.1000 0.0005
## 40 0.4112 nan 0.1000 -0.0078
## 60 0.3054 nan 0.1000 -0.0062
## 80 0.2381 nan 0.1000 -0.0060
## 100 0.1906 nan 0.1000 -0.0041
## 120 0.1537 nan 0.1000 -0.0043
## 140 0.1260 nan 0.1000 -0.0022
## 160 0.1038 nan 0.1000 -0.0034
## 180 0.0876 nan 0.1000 -0.0022
## 200 0.0746 nan 0.1000 -0.0030
## 220 0.0636 nan 0.1000 -0.0023
## 240 0.0545 nan 0.1000 -0.0010
## 260 0.0464 nan 0.1000 -0.0015
## 280 0.0408 nan 0.1000 -0.0014
## 300 0.0357 nan 0.1000 -0.0010
## 320 0.0312 nan 0.1000 -0.0004
## 340 0.0276 nan 0.1000 -0.0018
## 360 0.0248 nan 0.1000 -0.0008
## 380 0.0221 nan 0.1000 -0.0022
## 400 0.0198 nan 0.1000 -0.0014
## 420 0.0175 nan 0.1000 -0.0013
## 440 0.0158 nan 0.1000 -0.0005
## 460 0.0142 nan 0.1000 -0.0019
## 480 0.0128 nan 0.1000 -0.0010
## 500 0.0116 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.1000 0.1973
## 2 1.4470 nan 0.1000 0.1737
## 3 1.3128 nan 0.1000 0.1231
## 4 1.2112 nan 0.1000 0.1055
## 5 1.1204 nan 0.1000 0.0764
## 6 1.0546 nan 0.1000 0.0641
## 7 0.9941 nan 0.1000 0.0520
## 8 0.9424 nan 0.1000 0.0416
## 9 0.8951 nan 0.1000 0.0302
## 10 0.8573 nan 0.1000 0.0295
## 20 0.6239 nan 0.1000 0.0031
## 40 0.4180 nan 0.1000 -0.0106
## 60 0.3110 nan 0.1000 -0.0073
## 80 0.2419 nan 0.1000 -0.0065
## 100 0.1925 nan 0.1000 -0.0054
## 120 0.1555 nan 0.1000 -0.0036
## 140 0.1281 nan 0.1000 -0.0046
## 160 0.1076 nan 0.1000 -0.0034
## 180 0.0900 nan 0.1000 -0.0031
## 200 0.0766 nan 0.1000 -0.0025
## 220 0.0660 nan 0.1000 -0.0019
## 240 0.0559 nan 0.1000 -0.0022
## 260 0.0490 nan 0.1000 -0.0018
## 280 0.0422 nan 0.1000 -0.0024
## 300 0.0366 nan 0.1000 -0.0016
## 320 0.0322 nan 0.1000 -0.0018
## 340 0.0289 nan 0.1000 -0.0011
## 360 0.0252 nan 0.1000 -0.0009
## 380 0.0222 nan 0.1000 -0.0017
## 400 0.0199 nan 0.1000 -0.0006
## 420 0.0184 nan 0.1000 -0.0000
## 440 0.0158 nan 0.1000 -0.0012
## 460 0.0141 nan 0.1000 -0.0009
## 480 0.0126 nan 0.1000 -0.0010
## 500 0.0113 nan 0.1000 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.6094 nan 0.0500 0.1027
## 2 1.5375 nan 0.0500 0.0893
## 3 1.4729 nan 0.0500 0.0812
## 4 1.4142 nan 0.0500 0.0649
## 5 1.3647 nan 0.0500 0.0592
## 6 1.3188 nan 0.0500 0.0574
## 7 1.2780 nan 0.0500 0.0535
## 8 1.2389 nan 0.0500 0.0467
## 9 1.2031 nan 0.0500 0.0426
## 10 1.1717 nan 0.0500 0.0356
## 20 0.9501 nan 0.0500 0.0145
## 40 0.7434 nan 0.0500 0.0037
## 60 0.6386 nan 0.0500 -0.0005
## 80 0.5676 nan 0.0500 -0.0014
## 100 0.5152 nan 0.0500 -0.0024
## 120 0.4735 nan 0.0500 -0.0020
## 140 0.4378 nan 0.0500 -0.0023
## 160 0.4067 nan 0.0500 -0.0022
## 180 0.3817 nan 0.0500 -0.0037
## 200 0.3588 nan 0.0500 -0.0016
# Display results
print(gbm_model)
## Stochastic Gradient Boosting
##
## 1119 samples
## 24 predictor
## 5 classes: 'Very.Low', 'Low', 'Medium', 'High', 'Very.High'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 894, 895, 896, 896, 895
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees logLoss AUC
## 0.01 3 5 50 1.1975894 0.8935957
## 0.01 3 5 100 1.0185192 0.9027795
## 0.01 3 5 200 0.8601795 0.9137707
## 0.01 3 5 500 0.7496364 0.9208205
## 0.01 3 10 50 1.1970004 0.8912562
## 0.01 3 10 100 1.0176038 0.9017450
## 0.01 3 10 200 0.8568673 0.9129787
## 0.01 3 10 500 0.7425910 0.9219114
## 0.01 5 5 50 1.1417479 0.9035454
## 0.01 5 5 100 0.9577028 0.9108501
## 0.01 5 5 200 0.8138751 0.9182021
## 0.01 5 5 500 0.7358703 0.9226894
## 0.01 5 10 50 1.1404788 0.9033158
## 0.01 5 10 100 0.9543388 0.9109391
## 0.01 5 10 200 0.8131035 0.9168733
## 0.01 5 10 500 0.7324219 0.9224324
## 0.05 3 5 50 0.8236527 0.9143924
## 0.05 3 5 100 0.7541132 0.9189518
## 0.05 3 5 200 0.7366327 0.9212604
## 0.05 3 5 500 0.7967584 0.9201741
## 0.05 3 10 50 0.8171226 0.9157226
## 0.05 3 10 100 0.7428381 0.9218971
## 0.05 3 10 200 0.7320242 0.9220825
## 0.05 3 10 500 0.7939535 0.9211131
## 0.05 5 5 50 0.7901598 0.9162275
## 0.05 5 5 100 0.7497184 0.9189779
## 0.05 5 5 200 0.7603532 0.9206779
## 0.05 5 5 500 0.8710712 0.9201358
## 0.05 5 10 50 0.7889842 0.9160605
## 0.05 5 10 100 0.7441024 0.9206378
## 0.05 5 10 200 0.7600574 0.9207223
## 0.05 5 10 500 0.8903962 0.9180314
## 0.10 3 5 50 0.7747682 0.9145999
## 0.10 3 5 100 0.7636093 0.9170591
## 0.10 3 5 200 0.8075171 0.9165780
## 0.10 3 5 500 0.9534187 0.9164557
## 0.10 3 10 50 0.7496734 0.9192319
## 0.10 3 10 100 0.7438215 0.9194475
## 0.10 3 10 200 0.7923560 0.9174576
## 0.10 3 10 500 0.9631401 0.9156344
## 0.10 5 5 50 0.7584156 0.9173876
## 0.10 5 5 100 0.7766152 0.9179802
## 0.10 5 5 200 0.8500079 0.9190160
## 0.10 5 5 500 1.1116954 0.9176781
## 0.10 5 10 50 0.7609235 0.9167820
## 0.10 5 10 100 0.7837275 0.9168720
## 0.10 5 10 200 0.8627810 0.9168133
## 0.10 5 10 500 1.1546745 0.9157785
## prAUC Accuracy Kappa Mean_F1 Mean_Sensitivity
## 0.6660617 0.6255774 0.5317198 0.6219892 0.6244754
## 0.6854273 0.6479469 0.5597282 0.6471046 0.6470815
## 0.7065416 0.6648994 0.5810157 0.6670221 0.6643332
## 0.7213383 0.6755700 0.5943945 0.6779027 0.6749840
## 0.6608839 0.6175775 0.5216850 0.6111745 0.6162439
## 0.6797526 0.6380776 0.5473981 0.6355610 0.6371209
## 0.7056396 0.6621849 0.5776100 0.6631118 0.6615295
## 0.7250622 0.6791654 0.5988563 0.6799430 0.6784018
## 0.6858551 0.6469982 0.5585651 0.6460084 0.6461319
## 0.7030871 0.6613439 0.5765389 0.6620311 0.6605985
## 0.7168797 0.6774394 0.5966900 0.6792752 0.6768234
## 0.7278229 0.6818479 0.6022290 0.6835443 0.6812081
## 0.6838241 0.6479031 0.5597060 0.6468811 0.6470455
## 0.7021243 0.6604232 0.5753906 0.6596813 0.6596341
## 0.7147178 0.6782964 0.5977543 0.6789638 0.6775119
## 0.7283235 0.6827448 0.6033254 0.6838968 0.6820179
## 0.7094889 0.6675459 0.5843208 0.6696044 0.6668426
## 0.7184203 0.6827328 0.6032997 0.6840863 0.6818773
## 0.7211657 0.6800662 0.6000180 0.6816989 0.6794073
## 0.7180046 0.6746930 0.5932954 0.6770157 0.6739720
## 0.7128852 0.6791733 0.5988772 0.6789338 0.6783752
## 0.7261338 0.6791534 0.5988494 0.6795546 0.6782804
## 0.7222598 0.6943402 0.6178601 0.6956087 0.6935154
## 0.7255750 0.6889789 0.6111574 0.6907421 0.6883595
## 0.7114686 0.6755659 0.5943816 0.6772796 0.6748637
## 0.7176740 0.6791534 0.5988635 0.6800352 0.6784376
## 0.7210048 0.6818241 0.6022222 0.6833096 0.6811290
## 0.7191549 0.6800502 0.5999804 0.6818195 0.6793108
## 0.7117508 0.6711216 0.5888453 0.6707696 0.6704597
## 0.7245167 0.6800224 0.5999551 0.6809372 0.6793310
## 0.7233183 0.6800264 0.5999667 0.6813030 0.6795305
## 0.7194909 0.6800342 0.5999587 0.6814334 0.6793081
## 0.7066870 0.6719706 0.5898713 0.6736919 0.6714294
## 0.7130173 0.6737842 0.5921818 0.6760223 0.6732229
## 0.7098220 0.6719905 0.5899334 0.6739033 0.6715287
## 0.7149916 0.6746530 0.5932514 0.6770835 0.6740135
## 0.7174794 0.6773637 0.5966205 0.6788559 0.6767223
## 0.7153966 0.6809113 0.6011032 0.6828866 0.6804184
## 0.7138196 0.6710858 0.5888153 0.6732288 0.6705756
## 0.7135212 0.6720026 0.5899683 0.6746279 0.6715077
## 0.7171059 0.6711416 0.5888535 0.6724464 0.6704018
## 0.7197238 0.6773757 0.5966664 0.6789332 0.6766836
## 0.7210485 0.6738162 0.5921949 0.6751321 0.6730904
## 0.7175690 0.6720146 0.5899570 0.6743314 0.6713091
## 0.7123667 0.6684031 0.5853926 0.6695654 0.6674979
## 0.7150938 0.6836137 0.6044494 0.6848499 0.6829701
## 0.7177655 0.6719985 0.5899387 0.6737336 0.6713496
## 0.7132065 0.6836257 0.6044803 0.6854336 0.6830694
## Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value Mean_Precision
## 0.9063361 0.6298799 0.9070442 0.6298799
## 0.9119316 0.6547280 0.9122841 0.6547280
## 0.9162049 0.6748881 0.9161209 0.6748881
## 0.9188965 0.6851330 0.9187311 0.6851330
## 0.9043336 0.6189713 0.9053296 0.6189713
## 0.9094758 0.6415549 0.9099969 0.6415549
## 0.9155245 0.6709139 0.9156277 0.6709139
## 0.9197854 0.6864280 0.9198527 0.6864280
## 0.9117080 0.6528234 0.9120674 0.6528234
## 0.9153086 0.6700116 0.9154281 0.6700116
## 0.9193410 0.6865559 0.9192678 0.6865559
## 0.9204608 0.6888831 0.9203235 0.6888831
## 0.9119390 0.6512752 0.9122251 0.6512752
## 0.9150812 0.6649531 0.9153875 0.6649531
## 0.9195607 0.6865802 0.9196706 0.6865802
## 0.9206742 0.6908973 0.9207049 0.6908973
## 0.9168777 0.6768374 0.9167532 0.6768374
## 0.9206730 0.6915490 0.9206794 0.6915490
## 0.9200264 0.6881300 0.9199300 0.6881300
## 0.9186843 0.6845574 0.9185052 0.6845574
## 0.9197954 0.6829296 0.9198914 0.6829296
## 0.9197954 0.6841084 0.9198066 0.6841084
## 0.9236031 0.7021926 0.9235643 0.7021926
## 0.9222523 0.6968686 0.9221305 0.6968686
## 0.9188990 0.6820471 0.9187154 0.6820471
## 0.9197943 0.6859712 0.9197770 0.6859712
## 0.9204721 0.6886501 0.9203551 0.6886501
## 0.9200176 0.6877577 0.9198778 0.6877577
## 0.9177955 0.6748305 0.9179368 0.6748305
## 0.9200116 0.6861916 0.9199862 0.6861916
## 0.9200052 0.6862600 0.9199433 0.6862600
## 0.9200126 0.6868126 0.9199022 0.6868126
## 0.9179791 0.6797781 0.9178800 0.6797781
## 0.9184596 0.6831139 0.9182976 0.6831139
## 0.9180015 0.6803742 0.9178800 0.6803742
## 0.9186731 0.6844319 0.9184819 0.6844319
## 0.9193372 0.6863393 0.9193355 0.6863393
## 0.9202436 0.6892992 0.9201011 0.6892992
## 0.9177829 0.6807117 0.9176653 0.6807117
## 0.9180165 0.6805412 0.9177493 0.6805412
## 0.9177955 0.6777711 0.9177016 0.6777711
## 0.9193623 0.6840214 0.9192168 0.6840214
## 0.9184609 0.6807930 0.9183958 0.6807930
## 0.9180165 0.6808212 0.9178136 0.6808212
## 0.9170977 0.6763808 0.9170865 0.6763808
## 0.9209104 0.6902085 0.9208608 0.6902085
## 0.9180116 0.6789679 0.9178508 0.6789679
## 0.9209166 0.6909596 0.9207656 0.6909596
## Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6244754 0.1251155 0.7654057
## 0.6470815 0.1295894 0.7795065
## 0.6643332 0.1329799 0.7902691
## 0.6749840 0.1351140 0.7969403
## 0.6162439 0.1235155 0.7602887
## 0.6371209 0.1276155 0.7732983
## 0.6615295 0.1324370 0.7885270
## 0.6784018 0.1358331 0.7990936
## 0.6461319 0.1293996 0.7789200
## 0.6605985 0.1322688 0.7879535
## 0.6768234 0.1354879 0.7980822
## 0.6812081 0.1363696 0.8008344
## 0.6470455 0.1295806 0.7794923
## 0.6596341 0.1320846 0.7873576
## 0.6775119 0.1356593 0.7985363
## 0.6820179 0.1365490 0.8013460
## 0.6668426 0.1335092 0.7918601
## 0.6818773 0.1365466 0.8012751
## 0.6794073 0.1360132 0.7997169
## 0.6739720 0.1349386 0.7963282
## 0.6783752 0.1358347 0.7990853
## 0.6782804 0.1358307 0.7990379
## 0.6935154 0.1388680 0.8085593
## 0.6883595 0.1377958 0.8053059
## 0.6748637 0.1351132 0.7968814
## 0.6784376 0.1358307 0.7991160
## 0.6811290 0.1363648 0.8008005
## 0.6793108 0.1360100 0.7996642
## 0.6704597 0.1342243 0.7941276
## 0.6793310 0.1360045 0.7996713
## 0.6795305 0.1360053 0.7997678
## 0.6793081 0.1360068 0.7996604
## 0.6714294 0.1343941 0.7947042
## 0.6732229 0.1347568 0.7958413
## 0.6715287 0.1343981 0.7947651
## 0.6740135 0.1349306 0.7963433
## 0.6767223 0.1354727 0.7980298
## 0.6804184 0.1361823 0.8003310
## 0.6705756 0.1342172 0.7941793
## 0.6715077 0.1344005 0.7947621
## 0.6704018 0.1342283 0.7940986
## 0.6766836 0.1354751 0.7980230
## 0.6730904 0.1347632 0.7957756
## 0.6713091 0.1344029 0.7946628
## 0.6674979 0.1336806 0.7922978
## 0.6829701 0.1367227 0.8019403
## 0.6713496 0.1343997 0.7946806
## 0.6830694 0.1367251 0.8019930
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 3, shrinkage = 0.05 and n.minobsinnode = 10.
cat("Best GBM Accuracy (Cross-Validation):", max(gbm_model$results$Accuracy), "\n")
## Best GBM Accuracy (Cross-Validation): 0.6943402
In the end we get a ~69% accuracy, which is really similar to the first model. With this we observe that Gradient Boosting may not be the best approach for this model, since there are other models that perform slightly better such as SVM or Logistic Regression.
So far, we have used Gradient Boosting for multiple-class classification, but one of the strongest points of this algorithm is regression, which we can use for this problem. Since we wanted to perfor a classification taks, in the preprocessing we divided the price of the houses into different categories. Therefore, if we fit the model for regression and obtain a value, we can then predict the class knowing the limits of the class (e.g. if the model predict a price of 75, and we know that the “High” class are all observation whose price ranges between 50 and 100, the final prediction of this model would be “High”).
We will do this implementation and see if there are any significal improvements in the classification.
# For reproducibility purposes
set.seed(123)
data_regression <- data_enc[, -25]
data_regression$SalePrice <- target_variable
# Define the quantiles for classification
quantiles <- quantile(data_regression$SalePrice, probs = seq(0, 1, 0.2), na.rm = TRUE)
# Define custom summary function for classification evaluation
classification_summary <- function(data, lev = NULL, model = NULL) {
# Convert regression predictions to categories
predicted_category <- cut(
data$pred,
breaks = quantiles,
include.lowest = TRUE,
labels = c("Very Low", "Low", "Medium", "High", "Very High")
)
# Calculate confusion matrix metrics
actual_category <- cut(
data$obs,
breaks = quantiles,
include.lowest = TRUE,
labels = c("Very Low", "Low", "Medium", "High", "Very High")
)
cm <- confusionMatrix(predicted_category, actual_category)
# Return relevant classification metrics
out <- c(Accuracy = cm$overall["Accuracy"])
return(out)
}
# Define trainControl for cross-validation
train_control <- trainControl(
method = "cv",
number = 5,
summaryFunction = classification_summary,
savePredictions = "final"
)
xgb_grid <- expand.grid(
nrounds = c(50, 100, 200), # Number of boosting rounds
max_depth = c(3, 5), # Maximum depth of trees
eta = c(0.01, 0.05, 0.1), # Learning rate
gamma = c(0, 1), # Minimum loss reduction
colsample_bytree = 0.8, # Subsample ratio of columns
min_child_weight = c(1, 3, 5), # Minimum sum of instance weights
subsample = 0.8 # Subsample ratio of training instances
)
# Train the XGBoost regression model with cross-validation
model <- train(
SalePrice ~ .,
data = data_regression,
method = "xgbTree",
trControl = train_control,
tuneGrid = xgb_grid
)
## Warning in train.default(x, y, weights = w, ...): The metric "RMSE" was not in
## the result set. Accuracy.Accuracy will be used instead.
## [19:50:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:53] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:56] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:57] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:50:59] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:00] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:00] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:01] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:04] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:04] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:05] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:06] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:06] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:08] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:09] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:11] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:14] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:16] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:16] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:19] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:20] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:22] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:23] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:25] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:27] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:28] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:31] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:32] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:34] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:37] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:38] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:42] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:43] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:44] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:45] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:46] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:47] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:48] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [19:51:52] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
# Print the results of the best model
print(model)
## eXtreme Gradient Boosting
##
## 1119 samples
## 24 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 895, 895, 895, 896, 895
## Resampling results across tuning parameters:
##
## eta max_depth gamma min_child_weight nrounds Accuracy.Accuracy
## 0.01 3 0 1 50 0.3396060
## 0.01 3 0 1 100 0.4906310
## 0.01 3 0 1 200 0.6586603
## 0.01 3 0 3 50 0.3396060
## 0.01 3 0 3 100 0.4986827
## 0.01 3 0 3 200 0.6640175
## 0.01 3 0 5 50 0.3387092
## 0.01 3 0 5 100 0.4959922
## 0.01 3 0 5 200 0.6693986
## 0.01 3 1 1 50 0.3378203
## 0.01 3 1 1 100 0.4959922
## 0.01 3 1 1 200 0.6649183
## 0.01 3 1 3 50 0.3404989
## 0.01 3 1 3 100 0.4968930
## 0.01 3 1 3 200 0.6586563
## 0.01 3 1 5 50 0.3351377
## 0.01 3 1 5 100 0.4897422
## 0.01 3 1 5 200 0.6541840
## 0.01 5 0 1 50 0.3243994
## 0.01 5 0 1 100 0.5022462
## 0.01 5 0 1 200 0.6863509
## 0.01 5 0 3 50 0.3279789
## 0.01 5 0 3 100 0.5031350
## 0.01 5 0 3 200 0.6854620
## 0.01 5 0 5 50 0.3369194
## 0.01 5 0 5 100 0.4968890
## 0.01 5 0 5 200 0.6863389
## 0.01 5 1 1 50 0.3315543
## 0.01 5 1 1 100 0.5004604
## 0.01 5 1 1 200 0.6783032
## 0.01 5 1 3 50 0.3360426
## 0.01 5 1 3 100 0.5031430
## 0.01 5 1 3 200 0.6801009
## 0.01 5 1 5 50 0.3378123
## 0.01 5 1 5 100 0.5040359
## 0.01 5 1 5 200 0.6854660
## 0.05 3 0 1 50 0.6881606
## 0.05 3 0 1 100 0.7292561
## 0.05 3 0 1 200 0.7408552
## 0.05 3 0 3 50 0.6908352
## 0.05 3 0 3 100 0.7230021
## 0.05 3 0 3 200 0.7337204
## 0.05 3 0 5 50 0.6935098
## 0.05 3 0 5 100 0.7301409
## 0.05 3 0 5 200 0.7408632
## 0.05 3 1 1 50 0.6827875
## 0.05 3 1 1 100 0.7185218
## 0.05 3 1 1 200 0.7238869
## 0.05 3 1 3 50 0.6872478
## 0.05 3 1 3 100 0.7220932
## 0.05 3 1 3 200 0.7292521
## 0.05 3 1 5 50 0.6926129
## 0.05 3 1 5 100 0.7247798
## 0.05 3 1 5 200 0.7238869
## 0.05 5 0 1 50 0.7122598
## 0.05 5 0 1 100 0.7444226
## 0.05 5 0 1 200 0.7506807
## 0.05 5 0 3 50 0.7086923
## 0.05 5 0 3 100 0.7283592
## 0.05 5 0 3 200 0.7399744
## 0.05 5 0 5 50 0.7113429
## 0.05 5 0 5 100 0.7363909
## 0.05 5 0 5 200 0.7399383
## 0.05 5 1 1 50 0.7077995
## 0.05 5 1 1 100 0.7247758
## 0.05 5 1 1 200 0.7283512
## 0.05 5 1 3 50 0.7060338
## 0.05 5 1 3 100 0.7239110
## 0.05 5 1 3 200 0.7301730
## 0.05 5 1 5 50 0.6917120
## 0.05 5 1 5 100 0.7212164
## 0.05 5 1 5 200 0.7301650
## 0.10 3 0 1 50 0.7158432
## 0.10 3 0 1 100 0.7355141
## 0.10 3 0 1 200 0.7328395
## 0.10 3 0 3 50 0.7265895
## 0.10 3 0 3 100 0.7301610
## 0.10 3 0 3 200 0.7408873
## 0.10 3 0 5 50 0.7238829
## 0.10 3 0 5 100 0.7230061
## 0.10 3 0 5 200 0.7230061
## 0.10 3 1 1 50 0.7078195
## 0.10 3 1 1 100 0.7140815
## 0.10 3 1 1 200 0.7105021
## 0.10 3 1 3 50 0.7238749
## 0.10 3 1 3 100 0.7301329
## 0.10 3 1 3 200 0.7274544
## 0.10 3 1 5 50 0.7006566
## 0.10 3 1 5 100 0.7095972
## 0.10 3 1 5 200 0.7113829
## 0.10 5 0 1 50 0.7211963
## 0.10 5 0 1 100 0.7265535
## 0.10 5 0 1 200 0.7328035
## 0.10 5 0 3 50 0.7212124
## 0.10 5 0 3 100 0.7337124
## 0.10 5 0 3 200 0.7319307
## 0.10 5 0 5 50 0.7292681
## 0.10 5 0 5 100 0.7399864
## 0.10 5 0 5 200 0.7399744
## 0.10 5 1 1 50 0.7059938
## 0.10 5 1 1 100 0.7051129
## 0.10 5 1 1 200 0.7042160
## 0.10 5 1 3 50 0.7167281
## 0.10 5 1 3 100 0.7185218
## 0.10 5 1 3 200 0.7212044
## 0.10 5 1 5 50 0.7283792
## 0.10 5 1 5 100 0.7292721
## 0.10 5 1 5 200 0.7292721
##
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
##
## Tuning parameter 'subsample' was held constant at a value of 0.8
## Accuracy.Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 200, max_depth = 5, eta
## = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
## subsample = 0.8.
# View cross-validation results (Accuracy for classification)
results <- model$resample
print(results)
## Accuracy.Accuracy Resample
## 1 0.7767857 Fold5
## 2 0.7410714 Fold3
## 3 0.7857143 Fold2
## 4 0.6875000 Fold1
## 5 0.7623318 Fold4
# Final classification metrics from cross-validation
mean_accuracy <- mean(results$Accuracy)
cat("Mean Classification Accuracy (via CV):", mean_accuracy, "\n")
## Mean Classification Accuracy (via CV): 0.7506807
We observe a very drastical improvement with ~75% accuracy, the highest yet. With this we prove that Gradient Boosting is still one of the most robust methods, but we just have to take the right approach.
Although the results are very good, we will not take this model into account. The reason being that, if we were a company trying to earn a profit from predicting the house prices, and we had access to the real value of the houses we would use that instead of separating into classes. This is why we will “forget” that we have the precise valuation of the house and just use the first model we fitted.
Since XGBoost is a black-box model it is difficult to interpret its results. What we will do is make a plot to see how each feature impact the prediction of the model and compare it to previous results.
# Extract the feature names from the trained model
feature_names <- xgb_model$finalModel$feature_names
# Compute feature importances using xgb.importance
importance_df <- xgb.importance(
feature_names = feature_names,
model = xgb_model$finalModel
)
# Select the top 30 most important features
top_30_features <- importance_df %>%
arrange(desc(Gain)) %>%
head(30)
# Plot the top 30 feature importances
ggplot(top_30_features, aes(x = reorder(Feature, Gain), y = Gain)) +
geom_bar(stat = 'identity', fill = 'steelblue') +
coord_flip() +
ggtitle('Top 30 Feature Importances from XGBoost') +
xlab('Features') +
ylab('Gain') +
theme_minimal()
We observe the usual quality and area related variables as the most important. Most importantly, we also see that variables related to the basement seem to have a reasonable impact in the classification. Again this will be useful, since predictions that complement each other are best for an ensemble.
So far we have tried several algorithms for classification with some performing better than others. We have hints to know which algorithms are better than others, but in reality we do not know if one method will perform better than other on unseen data when the difference in performance is minimal. Because of this, we can benefit from an ensemble, so that our prediction is done by many models, meaning that we will reduce the variance in our prediction than with just one single algorithm.
With this logic, we will select the best performing models we have tried, these were: Kernel SVM, Ordinal Logistic Regression, Regularized Discriminant Analysis and XGBoost. Ordinal Logistic Regression obtained a high accuracy value but when creating the model it led to many errors mostly referring to not found initial values, as we were short on time and could not find a solution, we removed it from our ensemble model to avoid problems. We will train them with the optimal hyper-parameters found before.
To assess the ensemble’s accuracy, we will do cross-validation and check its performance.
# For reproducibility purposes
set.seed(123)
# Define the mapping
class_mapping <- c("Very.Low" = 1, "Low" = 2, "Medium" = 3, "High" = 4, "Very.High" = 5)
# Function to convert class predictions to numeric values
convert_to_numeric <- function(predictions) {
as.numeric(class_mapping[as.character(predictions)])
}
# Final hyperparameters for each model
hyperparams_rda <- list(gamma = 0, lambda = 0.9)
hyperparams_ksvm <- list(C = 2.5, sigma = 0.03)
hyperparams_xgboost <- list(nrounds = 200, eta = 0.1, max_depth = 5, colsample_bytree = 0.8, gamma = 1, min_child_weight = 3, subsample = 0.8)
# Create folds for cross-validation
folds <- createFolds(data_enc$SalePrice_Category, k = 5, returnTrain = TRUE)
# Initialize vector to store accuracies
cv_accuracies <- numeric(length(folds))
for (i in seq_along(folds)) {
cat("Fold: ", i, " ")
# Get train and test indices
train_indices <- folds[[i]]
test_indices <- setdiff(seq_along(data_enc$SalePrice_Category), train_indices)
# Split the data
train_data_cv <- data_enc[train_indices, ]
test_data_cv <- data_enc[test_indices, ]
test_labels_cv <- test_data_cv[, 25]
test_data_cv <- test_data_cv[, -25]
cat("RDA ")
# Train individual models on the training fold
final_rda_cv <- train(
SalePrice_Category ~ .,
data = train_data_cv,
method = "rda",
tuneGrid = expand.grid(hyperparams_rda)
)
cat("KSVM ")
final_ksvm_cv <- train(
SalePrice_Category ~ .,
data = train_data_cv,
method = "svmRadial",
tuneGrid = expand.grid(hyperparams_ksvm)
)
levels(train_data_cv$SalePrice_Category) <- make.names(levels(train_data_cv$SalePrice_Category))
cat("XGBoost ")
final_xgboost_cv <- train(
SalePrice_Category ~ .,
data = train_data_cv,
method = "xgbTree",
tuneGrid = expand.grid(hyperparams_xgboost),
metric = "Accuracy"
)
# Generate predictions on the test fold
pred_rda <- convert_to_numeric(predict(final_rda_cv, test_data_cv))
pred_ksvm <- convert_to_numeric(predict(final_ksvm_cv, test_data_cv))
pred_xgboost <- convert_to_numeric(predict(final_xgboost_cv, test_data_cv))
# Calculate ensemble predictions (average and round)
average_prediction <- (pred_rda + pred_ksvm + pred_xgboost) / 3
final_numeric_pred <- round(average_prediction)
# Convert back to class labels
final_class_pred <- names(class_mapping)[final_numeric_pred]
# Calculate accuracy for the fold
cv_accuracies[i] <- mean(as.factor(final_class_pred) == test_labels_cv)
}
## Fold: 1 RDA KSVM XGBoost Fold: 2 RDA KSVM XGBoost Fold: 3 RDA KSVM XGBoost Fold: 4 RDA KSVM XGBoost Fold: 5 RDA KSVM XGBoost
# Overall cross-validated accuracy
mean_cv_accuracy <- mean(cv_accuracies)
print(mean_cv_accuracy)
## [1] 0.7337501
With cross-validation we obtained and accuracy of ~73% in the ensemble which is the highest yet. Now we will finally train the ensemble on the whole training set.
To check that the feature extraction we will train the algorithm selecting instead of 15, 25 numerical columns and observe their difference in accuracy.
cat_features <- c("ExterQual", "FireplaceQu", "BsmtQual", "KitchenQual", "HouseStyle", "MSZoning", "Neighborhood", "BldgType", "GarageType", "SalePrice_Category")
num_features <- final_scaled_data[, selected_num_features2]
cat_features <- data_enc[, cat_features]
new_data <- cbind(num_features, cat_features)
# For reproducibility purposes
set.seed(123)
# Define the mapping
class_mapping <- c("Very.Low" = 1, "Low" = 2, "Medium" = 3, "High" = 4, "Very.High" = 5)
# Function to convert class predictions to numeric values
convert_to_numeric <- function(predictions) {
as.numeric(class_mapping[as.character(predictions)])
}
# Final hyperparameters for each model
hyperparams_rda <- list(gamma = 0, lambda = 0.9)
hyperparams_ksvm <- list(C = 2.5, sigma = 0.03)
hyperparams_xgboost <- list(nrounds = 200, eta = 0.1, max_depth = 5, colsample_bytree = 0.8, gamma = 1, min_child_weight = 3, subsample = 0.8)
# Create folds for cross-validation
folds <- createFolds(new_data$SalePrice_Category, k = 5, returnTrain = TRUE)
# Initialize vector to store accuracies
cv_accuracies <- numeric(length(folds))
for (i in seq_along(folds)) {
cat("Fold: ", i, " ")
# Get train and test indices
train_indices <- folds[[i]]
test_indices <- setdiff(seq_along(new_data$SalePrice_Category), train_indices)
# Split the data
train_data_cv <- new_data[train_indices, ]
test_data_cv <- new_data[test_indices, ]
test_labels_cv <- test_data_cv$SalePrice_Category
test_data_cv <- test_data_cv[, -which(names(test_data_cv) == "SalePrice_Category")]
cat("RDA ")
# Train individual models on the training fold
final_rda_cv <- train(
SalePrice_Category ~ .,
data = train_data_cv,
method = "rda",
tuneGrid = expand.grid(hyperparams_rda)
)
cat("KSVM ")
final_ksvm_cv <- train(
SalePrice_Category ~ .,
data = train_data_cv,
method = "svmRadial",
tuneGrid = expand.grid(hyperparams_ksvm)
)
levels(train_data_cv$SalePrice_Category) <- make.names(levels(train_data_cv$SalePrice_Category))
cat("XGBoost ")
final_xgboost_cv <- train(
SalePrice_Category ~ .,
data = train_data_cv,
method = "xgbTree",
tuneGrid = expand.grid(hyperparams_xgboost),
metric = "Accuracy"
)
# Generate predictions on the test fold
pred_rda <- convert_to_numeric(predict(final_rda_cv, test_data_cv))
pred_ksvm <- convert_to_numeric(predict(final_ksvm_cv, test_data_cv))
pred_xgboost <- convert_to_numeric(predict(final_xgboost_cv, test_data_cv))
# Calculate ensemble predictions (average and round)
average_prediction <- (pred_rda + pred_ksvm + pred_xgboost) / 3
final_numeric_pred <- round(average_prediction)
# Convert back to class labels
final_class_pred <- names(class_mapping)[final_numeric_pred]
# Calculate accuracy for the fold
cv_accuracies[i] <- mean(as.factor(final_class_pred) == test_labels_cv)
}
## Fold: 1 RDA KSVM XGBoost Fold: 2 RDA KSVM XGBoost Fold: 3 RDA KSVM XGBoost Fold: 4 RDA KSVM XGBoost Fold: 5 RDA KSVM XGBoost
# Overall cross-validated accuracy
mean_cv_accuracy <- mean(cv_accuracies)
print(mean_cv_accuracy)
## [1] 0.7310316
After training we even see a slight drop in accuracy. Since they are mostly the same we can conclude that adding those new 10 features does not really have an impact on classification, and therefore the feature extraction has been successful, reducing the complexity of the problem.
Now we will train the ensemble on the whole training data.
# For reproducibility purposes
set.seed(123)
# Train the models on the entire dataset
final_rda <- train(
SalePrice_Category ~ .,
data = data_enc,
method = "rda",
tuneGrid = expand.grid(hyperparams_rda)
)
final_ksvm <- train(
SalePrice_Category ~ .,
data = data_enc,
method = "svmRadial",
tuneGrid = expand.grid(hyperparams_ksvm),
probability = TRUE
)
final_xgboost <- train(
SalePrice_Category ~ .,
data = data_enc,
method = "xgbTree",
tuneGrid = expand.grid(hyperparams_xgboost),
metric = "Accuracy"
)
Building the function to make predictions of the ensemble.
# Function to generate ensemble predictions and evaluate performance
ensemble_predict <- function(test_data) {
test_labels <- test_data$SalePrice_Category
test_data <- test_data[, -which(names(test_data) == "SalePrice_Category")]
# Generate predictions for each model
pred_rda <- convert_to_numeric(predict(final_rda, test_data))
pred_ksvm <- convert_to_numeric(predict(final_ksvm, test_data))
pred_xgboost <- convert_to_numeric(predict(final_xgboost, test_data))
# Calculate ensemble predictions (average and round)
average_prediction <- (pred_rda + pred_ksvm + pred_xgboost) / 3
final_numeric_pred <- round(average_prediction)
# Convert back to class labels
final_class_pred <- as.factor(names(class_mapping)[final_numeric_pred])
# Calculate accuracy and confusion matrix
accuracy <- mean(final_class_pred == test_labels)
# Generate confusion matrix
cm <- confusionMatrix(final_class_pred, test_labels)
# Return both accuracy and confusion matrix
return(list(accuracy = accuracy, confusion_matrix = cm))
}
# Example usage
ensemble_predict(data_enc)
## Warning in confusionMatrix.default(final_class_pred, test_labels): Levels are
## not in the same order for reference and data. Refactoring data to match.
## $accuracy
## [1] 0.8588025
##
## $confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very.Low Low Medium High Very.High
## Very.Low 198 16 0 0 0
## Low 26 191 20 0 0
## Medium 2 18 176 27 0
## High 0 2 22 192 20
## Very.High 0 0 1 4 204
##
## Overall Statistics
##
## Accuracy : 0.8588
## 95% CI : (0.837, 0.8787)
## No Information Rate : 0.2029
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8235
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very.Low Class: Low Class: Medium Class: High
## Sensitivity 0.8761 0.8414 0.8037 0.8610
## Specificity 0.9821 0.9484 0.9478 0.9509
## Pos Pred Value 0.9252 0.8059 0.7892 0.8136
## Neg Pred Value 0.9691 0.9592 0.9520 0.9649
## Prevalence 0.2020 0.2029 0.1957 0.1993
## Detection Rate 0.1769 0.1707 0.1573 0.1716
## Detection Prevalence 0.1912 0.2118 0.1993 0.2109
## Balanced Accuracy 0.9291 0.8949 0.8757 0.9059
## Class: Very.High
## Sensitivity 0.9107
## Specificity 0.9944
## Pos Pred Value 0.9761
## Neg Pred Value 0.9780
## Prevalence 0.2002
## Detection Rate 0.1823
## Detection Prevalence 0.1868
## Balanced Accuracy 0.9526
We will now create some graphs to interpret this model and obtain insights about the data.
We will first obtained the feature importance taking into account the feature importance from each of these models. This is obtained as the mean value each model returns for the features.
As SVM is an algorithm which does not return probabilities we will not take it into account for some of the plots. We will still make them to see if we can obtain more information about or model but keeping in mind that we are missing part of our model.
# Computing feature importance for each model using varImp
importance_rda <- varImp(final_rda)$importance
importance_ksvm <- varImp(final_ksvm)$importance
importance_xgboost <- varImp(final_xgboost)$importance
# Combining and averaging importance scores
importance_df <- data.frame(
Feature = rownames(importance_rda),
RDA = rowMeans(as.data.frame(importance_rda)),
KSVM = rowMeans(as.data.frame(importance_ksvm)),
XGBoost = rowMeans(as.data.frame(importance_xgboost))
)
# Adding an Ensemble importance score (mean importance across all models)
importance_df$Ensemble <- rowMeans(importance_df[, c("RDA", "KSVM", "XGBoost")])
# Ordering the features with respect to the importance value
top_features <- importance_df %>%
arrange(desc(Ensemble))
# Plotting the importance
ggplot(top_features, aes(x = reorder(Feature, Ensemble), y = Ensemble)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
ggtitle("Feature Importances from Ensemble Model") +
xlab("Features") +
ylab("Importance") +
theme_minimal()
As seen with most of our previously trained models the models give a big importance to both Area and Quality which makes sense as they are two categories people take into account when buying houses. Moreover, BldgType and MSSubClass don’t seem to have a really high significance as the rest of models.
To conclude with the project we will test the model on the testing set we saved at the start. For this, we have to make the same transformations we did earlier to this new data.
# 1. Removing the same columns
na_columns <- c("Id", "Alley","PoolQC","Fence", "MiscFeature")
testing <- testing[, !(colnames(testing) %in% na_columns)]
# 2. Handling missing values
## 2.1 Numerical data
numeric_data_test <- select_if(testing, is.numeric)
numeric_data_test <- numeric_data_test %>%
mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
## 2.2 Categorical data
na_columns <- c("BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1","BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual","GarageCond")
categorical_data_test <- select_if(testing, is.character)
categorical_data_test[na_columns][is.na(categorical_data_test[na_columns])] <- "None"
imputed_categorical_data_test <- VIM::kNN(categorical_data_test,
variable = c("MasVnrType", "Electrical"),
k = 10)
categorical_data_test$MasVnrType <- imputed_categorical_data_test$MasVnrType
categorical_data_test$Electrical <- imputed_categorical_data_test$Electrical
# 3. Handling problematic columns
problematic_cols <- c("LowQualFinSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch")
problematic_data_test <- numeric_data_test[, problematic_cols]
new_numeric_data_test <- numeric_data_test[, !(names(numeric_data_test) %in% problematic_cols)]
# 4. Scaling numeric data
# Apply the saved bestNormalize transformations to the test set
scaled_num_data_test <- as.data.frame(lapply(names(new_numeric_data_test), function(col) {
if (col %in% names(training_bestNormalize)) {
predict(training_bestNormalize[[col]], newdata = new_numeric_data_test[[col]])
} else {
stop(paste("No transformation found for column:", col))
}
}))
## Warning in orderNorm_trans(object, newdata[!na_idx], warn): Transformations
## requested outside observed domain; logit approx. on ranks applied
## Warning in orderNorm_trans(object, newdata[!na_idx], warn): Transformations
## requested outside observed domain; logit approx. on ranks applied
## Warning in predict.double_reverse_log(object$chosen_transform, newdata =
## newdata, : Se han producido NaNs
## Warning in orderNorm_trans(object, newdata[!na_idx], warn): Transformations
## requested outside observed domain; logit approx. on ranks applied
names(scaled_num_data_test) <- names(new_numeric_data_test)
scaled_problematic_test <- as.data.frame(scale(problematic_data_test))
final_scaled_data_test <- cbind(scaled_num_data_test, scaled_problematic_test)
# 5. Feature selection
reduced_test_data <- final_scaled_data_test[, selected_num_features]
reduced_test_data$SalePrice <- final_scaled_data_test$SalePrice
categorical_data_test <- categorical_data_test[, selected_cat_features]
# 6. Encoding categorical columns
## 6.1 Label encoding
categorical_data_test$ExterQual <- as.integer(factor(categorical_data_test$ExterQual, levels = c("Po", "Fa", "TA", "Gd", "Ex")))
categorical_data_test$FireplaceQu <- as.integer(factor(categorical_data_test$FireplaceQu, levels = c("None", "Po", "Fa", "TA", "Gd", "Ex")))
categorical_data_test$BsmtQual <- as.integer(factor(categorical_data_test$BsmtQual, levels = c("None", "Fa", "TA", "Gd", "Ex")))
categorical_data_test$KitchenQual <- as.integer(factor(categorical_data_test$KitchenQual, levels = c("Fa", "TA", "Gd", "Ex")))
## 6.2 Target encoding
categorical_columns <- c("HouseStyle", "MSZoning", "Neighborhood", "BldgType", "GarageType")
for (col in categorical_columns) {
# Extract the tibble for the current column from training_target_means
target_map <- training_target_means[[col]]
# Ensure the tibble has the expected structure (first column: category, second column: mean target)
if (nrow(target_map) == 0) {
warning(paste("Target map for column", col, "is empty. Skipping."))
next
}
# Create a named vector where names are categories and values are mean targets
target_values <- target_map[[2]] # Second column is the mean target
names(target_values) <- as.character(target_map[[1]]) # First column is the category name
# Convert the test set column to character to ensure proper matching
categorical_data_test[[col]] <- as.character(categorical_data_test[[col]])
# Perform the mapping using `match`
mapped_indices <- match(categorical_data_test[[col]], names(target_values))
mapped_values <- target_values[mapped_indices]
# Replace any NA values (categories not found in training) with the fallback mean
fallback_mean <- mean(target_values, na.rm = TRUE)
mapped_values[is.na(mapped_values)] <- fallback_mean
# Assign the mapped values back to the test set
categorical_data_test[[col]] <- mapped_values
}
categorical_data_test <- as.data.frame(scale(categorical_data_test))
# 7. Category creation
labels <- c("Very.Low", "Low", "Medium", "High", "Very.High")
data_enc_test <- cbind(reduced_test_data, categorical_data_test)
data_enc_test$SalePrice_Category <- cut(data_enc_test$SalePrice,
breaks = training_saleprice_breaks,
labels = labels,
include.lowest = TRUE)
# 8. Final dataset
final_test_data <- data_enc_test[, -which(names(data_enc_test) == "SalePrice")]
final_test_data <- na.omit(final_test_data) # Drop rows with NA
With the test data being processed we can now make predictions with them.
ensemble_predict(final_test_data)
## Warning in confusionMatrix.default(final_class_pred, test_labels): Levels are
## not in the same order for reference and data. Refactoring data to match.
## $accuracy
## [1] 0.7222222
##
## $confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very.Low Low Medium High Very.High
## Very.Low 46 3 0 0 0
## Low 14 45 18 1 0
## Medium 1 5 24 10 0
## High 0 1 8 49 14
## Very.High 0 1 0 4 44
##
## Overall Statistics
##
## Accuracy : 0.7222
## 95% CI : (0.6666, 0.7732)
## No Information Rate : 0.2222
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.652
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very.Low Class: Low Class: Medium Class: High
## Sensitivity 0.7541 0.8182 0.48000 0.7656
## Specificity 0.9868 0.8584 0.93277 0.8973
## Pos Pred Value 0.9388 0.5769 0.60000 0.6806
## Neg Pred Value 0.9372 0.9524 0.89516 0.9306
## Prevalence 0.2118 0.1910 0.17361 0.2222
## Detection Rate 0.1597 0.1562 0.08333 0.1701
## Detection Prevalence 0.1701 0.2708 0.13889 0.2500
## Balanced Accuracy 0.8704 0.8383 0.70639 0.8315
## Class: Very.High
## Sensitivity 0.7586
## Specificity 0.9783
## Pos Pred Value 0.8980
## Neg Pred Value 0.9414
## Prevalence 0.2014
## Detection Rate 0.1528
## Detection Prevalence 0.1701
## Balanced Accuracy 0.8684
We finally obtain a ~72 % accuracy, which is really close to the one we estimated with cross validation. With this we can see that the model we obtained performs as expected and well on unseen data.
Next, we will compute the ROCs and the AUCs for the testing data, to interpret and see how the model has performed.
# Function to calculate predicted probabilities for the test set
ensemble_predict_probabilities_train <- function(test_data) {
test_labels <- test_data$SalePrice_Category
test_data <- test_data[, -which(names(test_data) == "SalePrice_Category")]
# Generate probability predictions for each model (RDA, SVM, XGBoost)
prob_rda <- predict(final_rda, test_data, type = "prob")
prob_xgboost <- predict(final_xgboost, test_data, type = "prob")
# Calculate ensemble probabilities as the mean of probabilities from each model
ensemble_probs <- (prob_rda + prob_xgboost) / 2
return(list(probs = ensemble_probs, labels = test_labels))
}
# Get ensemble probabilities and true labels for the test set
ensemble_results_test <- ensemble_predict_probabilities_train(final_test_data)
ensemble_probs_test <- ensemble_results_test$probs
true_labels_test <- ensemble_results_test$labels
# Create a list to store ROC curves for each class (One-vs-Rest)
roc_list_ensemble_test <- list()
# Get the levels of the true labels (class names)
class_levels_test <- levels(true_labels_test)
# Calculate ROC curve for each class using One-vs-Rest approach
for (i in 1:length(class_levels_test)) {
# Treat the current class as positive (1) and all others as negative (0)
binary_true_labels <- ifelse(true_labels_test == class_levels_test[i], 1, 0)
# Calculate ROC curve for the current class
roc_list_ensemble_test[[i]] <- roc(binary_true_labels, ensemble_probs_test[, i], quiet = TRUE)
}
# Plot ROC curves using ggroc
roc_plot_ensemble_test <- ggroc(roc_list_ensemble_test, aes = "color") +
ggtitle("One-vs-Rest ROC Curves for Ensemble Model (Test Set)") +
xlab("False Positive Rate") +
ylab("True Positive Rate") +
theme_minimal()
# Print the ROC curves and the plot
print(roc_list_ensemble_test)
## [[1]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = ensemble_probs_test[, i], quiet = TRUE)
##
## Data: ensemble_probs_test[, i] in 227 controls (binary_true_labels 0) < 61 cases (binary_true_labels 1).
## Area under the curve: 0.9789
##
## [[2]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = ensemble_probs_test[, i], quiet = TRUE)
##
## Data: ensemble_probs_test[, i] in 233 controls (binary_true_labels 0) < 55 cases (binary_true_labels 1).
## Area under the curve: 0.9111
##
## [[3]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = ensemble_probs_test[, i], quiet = TRUE)
##
## Data: ensemble_probs_test[, i] in 238 controls (binary_true_labels 0) < 50 cases (binary_true_labels 1).
## Area under the curve: 0.8707
##
## [[4]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = ensemble_probs_test[, i], quiet = TRUE)
##
## Data: ensemble_probs_test[, i] in 224 controls (binary_true_labels 0) < 64 cases (binary_true_labels 1).
## Area under the curve: 0.9293
##
## [[5]]
##
## Call:
## roc.default(response = binary_true_labels, predictor = ensemble_probs_test[, i], quiet = TRUE)
##
## Data: ensemble_probs_test[, i] in 230 controls (binary_true_labels 0) < 58 cases (binary_true_labels 1).
## Area under the curve: 0.9813
print(roc_plot_ensemble_test)
We obtain really high values for the AUC. We can also see that the model struggles the most with the middle classes especially, with the Medium class. Overall we observe that we have a really robust model that performs well on unseen data.
Finally, we will make a local SHAP interpretation for an observation on the dataset. We can only use RDA and XGBoost, since SVM cannot output probabilities, as explained in its introduction.
# Prediction wrapper
pfun <- function(object, newdata) {
prob <- predict(object, newdata = newdata, type = "prob")
return(prob[1, "Very.High"]) # Selecting the probability for "Very.High"
}
# Training features
X <- data_enc[, -which(names(data_enc) == "SalePrice_Category")]
# Single observation
house1 <- final_test_data[8, -which(names(final_test_data) == "SalePrice_Category")]
# Compute SHAP values of XGBoost
ex.house1.adj.xgboost <- fastshap::explain(
object = final_xgboost,
X = X,
pred_wrapper = pfun,
newdata = house1,
nsim = 50,
adjust = F
)
# Compute SHAP values of RDA
ex.house1.adj.rda <- fastshap::explain(
object = final_rda,
X = X,
pred_wrapper = pfun,
newdata = house1,
nsim = 50,
adjust = F
)
# Averaging SHAP values
ex.house1.adj <- (ex.house1.adj.xgboost + ex.house1.adj.rda)/2
# Plotting the results
shv <- shapviz(ex.house1.adj,
X = house1)
sv_waterfall(shv)
We see that for this observation, most features suggest that the category is “Very High”, especially the quality and some area ones. However, low 2nd floor area and other variables drive the probability that the category is “Very High” back. In fact, this observation’s true class was “Very High”, meaning the model predicted well.
To conclude we performed each step required. Firstly preprocessed and analyzed the data, in which we did some feature selection, then we tried every algorithm focusing both on predicting and interpreting its results. With this knowledge we then built an ensemble with the best algorithms as our final model. We finally tested the model in unseen data and concluded that it performed well, as well as making interpretations about the model.
Just as in the last project, we wanted to focus not only on the coding but also understanding as much as possible the algorithms we trained and the decisions we took. This meant diving deep in some mathematical concepts and explanations, but that way we felt confident in our choices.